2012-07-26 05:19:24 +08:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2012 Alexander Block. All rights reserved.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public
|
|
|
|
* License v2 as published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public
|
|
|
|
* License along with this program; if not, write to the
|
|
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
|
|
* Boston, MA 02111-1307, USA.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/bsearch.h>
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/file.h>
|
|
|
|
#include <linux/sort.h>
|
|
|
|
#include <linux/mount.h>
|
|
|
|
#include <linux/xattr.h>
|
|
|
|
#include <linux/posix_acl_xattr.h>
|
|
|
|
#include <linux/radix-tree.h>
|
2012-07-27 08:11:13 +08:00
|
|
|
#include <linux/vmalloc.h>
|
2013-08-21 15:32:13 +08:00
|
|
|
#include <linux/string.h>
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
#include "send.h"
|
|
|
|
#include "backref.h"
|
2014-01-30 05:06:04 +08:00
|
|
|
#include "hash.h"
|
2012-07-26 05:19:24 +08:00
|
|
|
#include "locking.h"
|
|
|
|
#include "disk-io.h"
|
|
|
|
#include "btrfs_inode.h"
|
|
|
|
#include "transaction.h"
|
2016-03-10 17:26:59 +08:00
|
|
|
#include "compression.h"
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A fs_path is a helper to dynamically build path names with unknown size.
|
|
|
|
* It reallocates the internal buffer on demand.
|
|
|
|
* It allows fast adding of path elements on the right side (normal path) and
|
|
|
|
* fast adding to the left side (reversed path). A reversed path can also be
|
|
|
|
* unreversed if needed.
|
|
|
|
*/
|
|
|
|
struct fs_path {
	union {
		struct {
			/* First byte of the path currently stored in buf. */
			char *start;
			/* Position of the terminating NUL of the path. */
			char *end;

			/* Backing storage: inline_buf or a heap allocation. */
			char *buf;
			/* Usable size of buf in bytes. */
			unsigned short buf_len:15;
			/*
			 * Non-zero when the path is built right-to-left,
			 * i.e. it is anchored at the end of buf.
			 */
			unsigned short reversed:1;
			/* Storage used until a larger buffer is needed. */
			char inline_buf[];
		};
		/*
		 * Average path length does not exceed 200 bytes, we'll have
		 * better packing in the slab and higher chance to satisfy
		 * an allocation later during send.
		 */
		char pad[256];
	};
};
|
|
|
|
/*
 * Number of bytes available in fs_path::inline_buf: everything in the
 * padded structure that follows the start of inline_buf.
 */
#define FS_PATH_INLINE_SIZE \
	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
|
|
|
|
|
|
|
|
|
|
|
|
/* reused for each extent */
|
|
|
|
struct clone_root {
|
|
|
|
struct btrfs_root *root;
|
|
|
|
u64 ino;
|
|
|
|
u64 offset;
|
|
|
|
|
|
|
|
u64 found_refs;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Name cache size limits. The clean size is defined as twice the maximum
 * size.
 * NOTE(review): how the two limits are applied is not visible in this part
 * of the file - confirm against the name cache code.
 */
#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
|
|
|
|
|
|
|
|
struct send_ctx {
|
|
|
|
struct file *send_filp;
|
|
|
|
loff_t send_off;
|
|
|
|
char *send_buf;
|
|
|
|
u32 send_size;
|
|
|
|
u32 send_max_size;
|
|
|
|
u64 total_send_size;
|
|
|
|
u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
|
2013-02-05 04:54:57 +08:00
|
|
|
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
struct btrfs_root *send_root;
|
|
|
|
struct btrfs_root *parent_root;
|
|
|
|
struct clone_root *clone_roots;
|
|
|
|
int clone_roots_cnt;
|
|
|
|
|
|
|
|
/* current state of the compare_tree call */
|
|
|
|
struct btrfs_path *left_path;
|
|
|
|
struct btrfs_path *right_path;
|
|
|
|
struct btrfs_key *cmp_key;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* infos of the currently processed inode. In case of deleted inodes,
|
|
|
|
* these are the values from the deleted inode.
|
|
|
|
*/
|
|
|
|
u64 cur_ino;
|
|
|
|
u64 cur_inode_gen;
|
|
|
|
int cur_inode_new;
|
|
|
|
int cur_inode_new_gen;
|
|
|
|
int cur_inode_deleted;
|
|
|
|
u64 cur_inode_size;
|
|
|
|
u64 cur_inode_mode;
|
2014-02-27 17:29:01 +08:00
|
|
|
u64 cur_inode_rdev;
|
2013-10-23 00:18:51 +08:00
|
|
|
u64 cur_inode_last_extent;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
u64 send_progress;
|
|
|
|
|
|
|
|
struct list_head new_refs;
|
|
|
|
struct list_head deleted_refs;
|
|
|
|
|
|
|
|
struct radix_tree_root name_cache;
|
|
|
|
struct list_head name_cache_list;
|
|
|
|
int name_cache_size;
|
|
|
|
|
2014-03-05 10:07:35 +08:00
|
|
|
struct file_ra_state ra;
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
char *read_buf;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We process inodes by their increasing order, so if before an
|
|
|
|
* incremental send we reverse the parent/child relationship of
|
|
|
|
* directories such that a directory with a lower inode number was
|
|
|
|
* the parent of a directory with a higher inode number, and the one
|
|
|
|
* becoming the new parent got renamed too, we can't rename/move the
|
|
|
|
* directory with lower inode number when we finish processing it - we
|
|
|
|
* must process the directory with higher inode number first, then
|
|
|
|
* rename/move it and then rename/move the directory with lower inode
|
|
|
|
* number. Example follows.
|
|
|
|
*
|
|
|
|
* Tree state when the first send was performed:
|
|
|
|
*
|
|
|
|
* .
|
|
|
|
* |-- a (ino 257)
|
|
|
|
* |-- b (ino 258)
|
|
|
|
* |
|
|
|
|
* |
|
|
|
|
* |-- c (ino 259)
|
|
|
|
* | |-- d (ino 260)
|
|
|
|
* |
|
|
|
|
* |-- c2 (ino 261)
|
|
|
|
*
|
|
|
|
* Tree state when the second (incremental) send is performed:
|
|
|
|
*
|
|
|
|
* .
|
|
|
|
* |-- a (ino 257)
|
|
|
|
* |-- b (ino 258)
|
|
|
|
* |-- c2 (ino 261)
|
|
|
|
* |-- d2 (ino 260)
|
|
|
|
* |-- cc (ino 259)
|
|
|
|
*
|
|
|
|
* The sequence of steps that lead to the second state was:
|
|
|
|
*
|
|
|
|
* mv /a/b/c/d /a/b/c2/d2
|
|
|
|
* mv /a/b/c /a/b/c2/d2/cc
|
|
|
|
*
|
|
|
|
* "c" has lower inode number, but we can't move it (2nd mv operation)
|
|
|
|
* before we move "d", which has higher inode number.
|
|
|
|
*
|
|
|
|
* So we just memorize which move/rename operations must be performed
|
|
|
|
* later when their respective parent is processed and moved/renamed.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Indexed by parent directory inode number. */
|
|
|
|
struct rb_root pending_dir_moves;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reverse index, indexed by the inode number of a directory that
|
|
|
|
* is waiting for the move/rename of its immediate parent before its
|
|
|
|
* own move/rename can be performed.
|
|
|
|
*/
|
|
|
|
struct rb_root waiting_dir_moves;
|
2014-02-19 22:31:44 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A directory that is going to be rm'ed might have a child directory
|
|
|
|
* which is in the pending directory moves index above. In this case,
|
|
|
|
* the directory can only be removed after the move/rename of its child
|
|
|
|
* is performed. Example:
|
|
|
|
*
|
|
|
|
* Parent snapshot:
|
|
|
|
*
|
|
|
|
* . (ino 256)
|
|
|
|
* |-- a/ (ino 257)
|
|
|
|
* |-- b/ (ino 258)
|
|
|
|
* |-- c/ (ino 259)
|
|
|
|
* | |-- x/ (ino 260)
|
|
|
|
* |
|
|
|
|
* |-- y/ (ino 261)
|
|
|
|
*
|
|
|
|
* Send snapshot:
|
|
|
|
*
|
|
|
|
* . (ino 256)
|
|
|
|
* |-- a/ (ino 257)
|
|
|
|
* |-- b/ (ino 258)
|
|
|
|
* |-- YY/ (ino 261)
|
|
|
|
* |-- x/ (ino 260)
|
|
|
|
*
|
|
|
|
* Sequence of steps that lead to the send snapshot:
|
|
|
|
* rm -f /a/b/c/foo.txt
|
|
|
|
* mv /a/b/y /a/b/YY
|
|
|
|
* mv /a/b/c/x /a/b/YY
|
|
|
|
* rmdir /a/b/c
|
|
|
|
*
|
|
|
|
* When the child is processed, its move/rename is delayed until its
|
|
|
|
* parent is processed (as explained above), but all other operations
|
|
|
|
* like update utimes, chown, chgrp, etc, are performed and the paths
|
|
|
|
* that it uses for those operations must use the orphanized name of
|
|
|
|
* its parent (the directory we're going to rm later), so we need to
|
|
|
|
* memorize that name.
|
|
|
|
*
|
|
|
|
* Indexed by the inode number of the directory to be deleted.
|
|
|
|
*/
|
|
|
|
struct rb_root orphan_dirs;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
struct pending_dir_move {
|
|
|
|
struct rb_node node;
|
|
|
|
struct list_head list;
|
|
|
|
u64 parent_ino;
|
|
|
|
u64 ino;
|
|
|
|
u64 gen;
|
|
|
|
struct list_head update_refs;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct waiting_dir_move {
|
|
|
|
struct rb_node node;
|
|
|
|
u64 ino;
|
2014-02-19 22:31:44 +08:00
|
|
|
/*
|
|
|
|
* There might be some directory that could not be removed because it
|
|
|
|
* was waiting for this directory inode to be moved first. Therefore
|
|
|
|
* after this directory is moved, we can try to rmdir the ino rmdir_ino.
|
|
|
|
*/
|
|
|
|
u64 rmdir_ino;
|
Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.
For example, for the following reproducer where this is needed (provided
by Robbie Ko):
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt2
$ mkdir -p /mnt/data/n1/n2
$ mkdir /mnt/data/n4
$ mkdir -p /mnt/data/t6/t7
$ mkdir /mnt/data/t5
$ mkdir /mnt/data/t7
$ mkdir /mnt/data/n4/t2
$ mkdir /mnt/data/t4
$ mkdir /mnt/data/t3
$ mv /mnt/data/t7 /mnt/data/n4/t2
$ mv /mnt/data/t4 /mnt/data/n4/t2/t7
$ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
$ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
$ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
$ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
$ mv /mnt/data/n4/t2 /mnt/data/n4/n1
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
$ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
$ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 | btrfs receive /mnt2
$ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
ERROR: send ioctl failed with -12: Cannot allocate memory
Where the parent snapshot directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- t2/ (ino 265)
|-- t7/ (ino 264)
|-- t4/ (ino 266)
|-- t5/ (ino 263)
|-- t6/ (ino 261)
|-- n1/ (ino 258)
|-- n2/ (ino 259)
|-- t7/ (ino 262)
|-- t3/ (ino 267)
And the send snapshot's directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- n1/ (ino 258)
|-- t2/ (ino 265)
|-- n2/ (ino 259)
|-- t3/ (ino 267)
| |-- t7 (ino 264)
|
|-- t6/ (ino 261)
| |-- t4/ (ino 266)
| |-- t5/ (ino 263)
|
|-- t7/ (ino 262)
While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
have been done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:
start inode 264, send progress of 265 for example
parent of 264 -> 267
parent of 267 -> 262
parent of 262 -> 259
parent of 259 -> 261
parent of 261 -> 263
parent of 263 -> 266
parent of 266 -> 264
|--> back to first iteration while current path string length
is <= PATH_MAX, and fail with -ENOMEM otherwise
So fix this by making the check if we need to delay a directory rename
regardless of the current inode having been orphanized or not.
A test case for fstests follows soon.
Thanks to Robbie Ko for providing a reproducer for this problem.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-04-09 21:09:14 +08:00
|
|
|
bool orphanized;
|
2014-02-19 22:31:44 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
struct orphan_dir_info {
|
|
|
|
struct rb_node node;
|
|
|
|
u64 ino;
|
|
|
|
u64 gen;
|
2012-07-26 05:19:24 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
struct name_cache_entry {
|
|
|
|
struct list_head list;
|
2012-07-28 20:20:58 +08:00
|
|
|
/*
|
|
|
|
* radix_tree has only 32bit entries but we need to handle 64bit inums.
|
|
|
|
* We use the lower 32bit of the 64bit inum to store it in the tree. If
|
|
|
|
* more then one inum would fall into the same entry, we use radix_list
|
|
|
|
* to store the additional entries. radix_list is also used to store
|
|
|
|
* entries where two entries have the same inum but different
|
|
|
|
* generations.
|
|
|
|
*/
|
|
|
|
struct list_head radix_list;
|
2012-07-26 05:19:24 +08:00
|
|
|
u64 ino;
|
|
|
|
u64 gen;
|
|
|
|
u64 parent_ino;
|
|
|
|
u64 parent_gen;
|
|
|
|
int ret;
|
|
|
|
int need_later_update;
|
|
|
|
int name_len;
|
|
|
|
char name[];
|
|
|
|
};
|
|
|
|
|
Btrfs: send, don't bug on inconsistent snapshots
When doing an incremental send, if we find a new/modified/deleted extent,
reference or xattr without having previously processed the corresponding
inode item we end up executing a BUG_ON(). This is because whenever an
extent, xattr or reference is added, modified or deleted, we always expect
to have the corresponding inode item updated. However there are situations
where this will not happen due to transient -ENOMEM or -ENOSPC errors when
doing delayed inode updates.
For example, when punching holes we can succeed in deleting and modifying
(shrinking) extents but later fail to do the delayed inode update. So after
such failure we close our transaction handle and right after a snapshot of
the fs/subvol tree can be made and used later for a send operation. The
same thing can happen during truncate, link, unlink, and xattr related
operations.
So instead of executing a BUG_ON, make send return an -EIO error and print
an informative error message to dmesg/syslog.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2016-08-01 08:50:37 +08:00
|
|
|
static void inconsistent_snapshot_error(struct send_ctx *sctx,
|
|
|
|
enum btrfs_compare_tree_result result,
|
|
|
|
const char *what)
|
|
|
|
{
|
|
|
|
const char *result_string;
|
|
|
|
|
|
|
|
switch (result) {
|
|
|
|
case BTRFS_COMPARE_TREE_NEW:
|
|
|
|
result_string = "new";
|
|
|
|
break;
|
|
|
|
case BTRFS_COMPARE_TREE_DELETED:
|
|
|
|
result_string = "deleted";
|
|
|
|
break;
|
|
|
|
case BTRFS_COMPARE_TREE_CHANGED:
|
|
|
|
result_string = "updated";
|
|
|
|
break;
|
|
|
|
case BTRFS_COMPARE_TREE_SAME:
|
|
|
|
ASSERT(0);
|
|
|
|
result_string = "unchanged";
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
ASSERT(0);
|
|
|
|
result_string = "unexpected";
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_err(sctx->send_root->fs_info,
|
|
|
|
"Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
|
|
|
|
result_string, what, sctx->cmp_key->objectid,
|
|
|
|
sctx->send_root->root_key.objectid,
|
|
|
|
(sctx->parent_root ?
|
|
|
|
sctx->parent_root->root_key.objectid : 0));
|
|
|
|
}
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
/* Forward declarations of helpers defined later in this file. */
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);

static struct waiting_dir_move *get_waiting_dir_move(struct send_ctx *sctx,
						     u64 ino);

static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
|
|
|
|
|
2013-10-23 00:18:51 +08:00
|
|
|
static int need_send_hole(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
return (sctx->parent_root && !sctx->cur_inode_new &&
|
|
|
|
!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
|
|
|
|
S_ISREG(sctx->cur_inode_mode));
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
static void fs_path_reset(struct fs_path *p)
|
|
|
|
{
|
|
|
|
if (p->reversed) {
|
|
|
|
p->start = p->buf + p->buf_len - 1;
|
|
|
|
p->end = p->start;
|
|
|
|
*p->start = 0;
|
|
|
|
} else {
|
|
|
|
p->start = p->buf;
|
|
|
|
p->end = p->start;
|
|
|
|
*p->start = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
static struct fs_path *fs_path_alloc(void)
|
2012-07-26 05:19:24 +08:00
|
|
|
{
|
|
|
|
struct fs_path *p;
|
|
|
|
|
2016-01-19 01:42:13 +08:00
|
|
|
p = kmalloc(sizeof(*p), GFP_KERNEL);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!p)
|
|
|
|
return NULL;
|
|
|
|
p->reversed = 0;
|
|
|
|
p->buf = p->inline_buf;
|
|
|
|
p->buf_len = FS_PATH_INLINE_SIZE;
|
|
|
|
fs_path_reset(p);
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
static struct fs_path *fs_path_alloc_reversed(void)
|
2012-07-26 05:19:24 +08:00
|
|
|
{
|
|
|
|
struct fs_path *p;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
p = fs_path_alloc();
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!p)
|
|
|
|
return NULL;
|
|
|
|
p->reversed = 1;
|
|
|
|
fs_path_reset(p);
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
static void fs_path_free(struct fs_path *p)
|
2012-07-26 05:19:24 +08:00
|
|
|
{
|
|
|
|
if (!p)
|
|
|
|
return;
|
2014-02-05 23:17:34 +08:00
|
|
|
if (p->buf != p->inline_buf)
|
|
|
|
kfree(p->buf);
|
2012-07-26 05:19:24 +08:00
|
|
|
kfree(p);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int fs_path_len(struct fs_path *p)
|
|
|
|
{
|
|
|
|
return p->end - p->start;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int fs_path_ensure_buf(struct fs_path *p, int len)
|
|
|
|
{
|
|
|
|
char *tmp_buf;
|
|
|
|
int path_len;
|
|
|
|
int old_buf_len;
|
|
|
|
|
|
|
|
len++;
|
|
|
|
|
|
|
|
if (p->buf_len >= len)
|
|
|
|
return 0;
|
|
|
|
|
2014-04-26 20:02:03 +08:00
|
|
|
if (len > PATH_MAX) {
|
|
|
|
WARN_ON(1);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2014-02-26 02:32:59 +08:00
|
|
|
path_len = p->end - p->start;
|
|
|
|
old_buf_len = p->buf_len;
|
|
|
|
|
2014-02-05 23:17:34 +08:00
|
|
|
/*
|
|
|
|
* First time the inline_buf does not suffice
|
|
|
|
*/
|
2014-05-22 00:38:13 +08:00
|
|
|
if (p->buf == p->inline_buf) {
|
2016-01-19 01:42:13 +08:00
|
|
|
tmp_buf = kmalloc(len, GFP_KERNEL);
|
2014-05-22 00:38:13 +08:00
|
|
|
if (tmp_buf)
|
|
|
|
memcpy(tmp_buf, p->buf, old_buf_len);
|
|
|
|
} else {
|
2016-01-19 01:42:13 +08:00
|
|
|
tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
|
2014-05-22 00:38:13 +08:00
|
|
|
}
|
2014-02-26 02:33:08 +08:00
|
|
|
if (!tmp_buf)
|
|
|
|
return -ENOMEM;
|
|
|
|
p->buf = tmp_buf;
|
|
|
|
/*
|
|
|
|
* The real size of the buffer is bigger, this will let the fast path
|
|
|
|
* happen most of the time
|
|
|
|
*/
|
|
|
|
p->buf_len = ksize(p->buf);
|
2014-02-05 23:17:34 +08:00
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
if (p->reversed) {
|
|
|
|
tmp_buf = p->buf + old_buf_len - path_len - 1;
|
|
|
|
p->end = p->buf + p->buf_len - 1;
|
|
|
|
p->start = p->end - path_len;
|
|
|
|
memmove(p->start, tmp_buf, path_len + 1);
|
|
|
|
} else {
|
|
|
|
p->start = p->buf;
|
|
|
|
p->end = p->start + path_len;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-02-04 02:23:19 +08:00
|
|
|
static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
|
|
|
|
char **prepared)
|
2012-07-26 05:19:24 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
int new_len;
|
|
|
|
|
|
|
|
new_len = p->end - p->start + name_len;
|
|
|
|
if (p->start != p->end)
|
|
|
|
new_len++;
|
|
|
|
ret = fs_path_ensure_buf(p, new_len);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (p->reversed) {
|
|
|
|
if (p->start != p->end)
|
|
|
|
*--p->start = '/';
|
|
|
|
p->start -= name_len;
|
2014-02-04 02:23:19 +08:00
|
|
|
*prepared = p->start;
|
2012-07-26 05:19:24 +08:00
|
|
|
} else {
|
|
|
|
if (p->start != p->end)
|
|
|
|
*p->end++ = '/';
|
2014-02-04 02:23:19 +08:00
|
|
|
*prepared = p->end;
|
2012-07-26 05:19:24 +08:00
|
|
|
p->end += name_len;
|
|
|
|
*p->end = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Add the element @name of @name_len bytes to the path, separated by '/'
 * when the path is not empty.
 *
 * Returns a negative errno if no room could be reserved, otherwise the
 * (non-negative) result of fs_path_prepare_for_add().
 */
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
	char *dst;
	int ret = fs_path_prepare_for_add(p, name_len, &dst);

	if (ret >= 0)
		memcpy(dst, name, name_len);
	return ret;
}
|
|
|
|
|
|
|
|
static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
|
|
|
|
{
|
|
|
|
int ret;
|
2014-02-04 02:23:19 +08:00
|
|
|
char *prepared;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2014-02-04 02:23:19 +08:00
|
|
|
ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2014-02-04 02:23:19 +08:00
|
|
|
memcpy(prepared, p2->start, p2->end - p2->start);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Add an element to the path whose @len bytes are read from the extent
 * buffer @eb starting at offset @off.
 *
 * Returns a negative errno if no room could be reserved, otherwise the
 * (non-negative) result of fs_path_prepare_for_add().
 */
static int fs_path_add_from_extent_buffer(struct fs_path *p,
					  struct extent_buffer *eb,
					  unsigned long off, int len)
{
	char *dst;
	int ret = fs_path_prepare_for_add(p, len, &dst);

	if (ret >= 0)
		read_extent_buffer(eb, dst, off, len);
	return ret;
}
|
|
|
|
|
|
|
|
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
p->reversed = from->reversed;
|
|
|
|
fs_path_reset(p);
|
|
|
|
|
|
|
|
ret = fs_path_add_path(p, from);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void fs_path_unreverse(struct fs_path *p)
|
|
|
|
{
|
|
|
|
char *tmp;
|
|
|
|
int len;
|
|
|
|
|
|
|
|
if (!p->reversed)
|
|
|
|
return;
|
|
|
|
|
|
|
|
tmp = p->start;
|
|
|
|
len = p->end - p->start;
|
|
|
|
p->start = p->buf;
|
|
|
|
p->end = p->start + len;
|
|
|
|
memmove(p->start, tmp, len + 1);
|
|
|
|
p->reversed = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct btrfs_path *alloc_path_for_send(void)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return NULL;
|
|
|
|
path->search_commit_root = 1;
|
|
|
|
path->skip_locking = 1;
|
2014-03-29 05:16:01 +08:00
|
|
|
path->need_commit_sem = 1;
|
2012-07-26 05:19:24 +08:00
|
|
|
return path;
|
|
|
|
}
|
|
|
|
|
2013-04-26 04:41:01 +08:00
|
|
|
/*
 * Write all @len bytes of @buf to @filp at *@off, calling vfs_write() again
 * after short writes until everything is written.
 *
 * @buf is a kernel pointer that is passed to vfs_write() through a __user
 * cast, so the address limit is switched to KERNEL_DS for the duration of
 * the writes and restored before returning.
 *
 * Returns 0 on success, the negative value returned by vfs_write() on error,
 * or -EIO when vfs_write() reports that nothing was written.
 */
static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
{
	mm_segment_t saved_fs = get_fs();
	u32 done = 0;
	int ret = 0;

	set_fs(KERNEL_DS);

	while (done < len) {
		int written = vfs_write(filp,
				(__force const char __user *)buf + done,
				len - done, off);

		/*
		 * TODO handle -ERESTARTSYS correctly; for now it is returned
		 * like any other error instead of being retried.
		 */
		if (written <= 0) {
			ret = written ? written : -EIO;
			break;
		}
		done += written;
	}

	set_fs(saved_fs);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Append one attribute (type, length, value) to the command that is being
 * assembled in sctx->send_buf.
 *
 * @attr: attribute id, stored little endian in the 16 bit tlv_type field
 * @data: payload, copied right behind the TLV header
 * @len:  payload size in bytes
 *
 * Returns 0 on success, -EINVAL for a negative @len and -EOVERFLOW when the
 * payload does not fit into the 16 bit tlv_len field or into the space that
 * is left in the send buffer. Nothing is written on failure.
 */
static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
{
	struct btrfs_tlv_header *hdr;
	int total_len;
	int left = sctx->send_max_size - sctx->send_size;

	/* A negative length would become a huge size in memcpy() below. */
	if (unlikely(len < 0))
		return -EINVAL;

	/* tlv_len is only 16 bits wide, never store a truncated length. */
	if (unlikely(len > U16_MAX))
		return -EOVERFLOW;

	total_len = sizeof(*hdr) + len;
	if (unlikely(left < total_len))
		return -EOVERFLOW;

	hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
	hdr->tlv_type = cpu_to_le16(attr);
	hdr->tlv_len = cpu_to_le16(len);
	/* The payload starts directly behind the header. */
	memcpy(hdr + 1, data, len);
	sctx->send_size += total_len;

	return 0;
}
|
|
|
|
|
2013-12-17 00:34:10 +08:00
|
|
|
#define TLV_PUT_DEFINE_INT(bits) \
|
|
|
|
static int tlv_put_u##bits(struct send_ctx *sctx, \
|
|
|
|
u##bits attr, u##bits value) \
|
|
|
|
{ \
|
|
|
|
__le##bits __tmp = cpu_to_le##bits(value); \
|
|
|
|
return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2013-12-17 00:34:10 +08:00
|
|
|
TLV_PUT_DEFINE_INT(64)
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
/*
 * Append the string @str as attribute @attr. Exactly @len bytes are stored;
 * when @len is -1 the length is taken from strlen(@str).
 *
 * Returns what tlv_put() returns.
 */
static int tlv_put_string(struct send_ctx *sctx, u16 attr,
			  const char *str, int len)
{
	const int nbytes = (len != -1) ? len : (int)strlen(str);

	return tlv_put(sctx, attr, str, nbytes);
}
|
|
|
|
|
|
|
|
static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
|
|
|
|
const u8 *uuid)
|
|
|
|
{
|
|
|
|
return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Append a btrfs_timespec as attribute @attr.
 *
 * @ts is not dereferenced: its value is used as the offset inside the extent
 * buffer @eb from which the timespec is read into a local copy, and that
 * copy is stored unmodified.
 *
 * Returns what tlv_put() returns.
 */
static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
				  struct extent_buffer *eb,
				  struct btrfs_timespec *ts)
{
	const unsigned long eb_offset = (unsigned long)ts;
	struct btrfs_timespec raw;

	read_extent_buffer(eb, &raw, eb_offset, sizeof(raw));
	return tlv_put(sctx, attr, &raw, sizeof(raw));
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Convenience wrappers around the tlv_put*() helpers.
 *
 * Every wrapper stores the helper's result in a variable named 'ret' and
 * jumps to a label named 'tlv_put_failure' when that result is negative, so
 * the function using them has to provide both.
 *
 * TLV_PUT takes the payload pointer first and its length second, the same
 * order tlv_put() expects.
 */
#define TLV_PUT(sctx, attrtype, data, attrlen) \
	do { \
		ret = tlv_put(sctx, attrtype, data, attrlen); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_INT(sctx, attrtype, bits, value) \
	do { \
		ret = tlv_put_u##bits(sctx, attrtype, value); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
#define TLV_PUT_STRING(sctx, attrtype, str, len) \
	do { \
		ret = tlv_put_string(sctx, attrtype, str, len); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
/* 'p' is parenthesized so that any pointer expression can be passed. */
#define TLV_PUT_PATH(sctx, attrtype, p) \
	do { \
		ret = tlv_put_string(sctx, attrtype, (p)->start, \
			(p)->end - (p)->start); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_UUID(sctx, attrtype, uuid) \
	do { \
		ret = tlv_put_uuid(sctx, attrtype, uuid); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
	do { \
		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
|
|
|
|
|
|
|
|
static int send_header(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
struct btrfs_stream_header hdr;
|
|
|
|
|
|
|
|
strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
|
|
|
|
hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
|
|
|
|
|
2012-09-14 14:04:21 +08:00
|
|
|
return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
|
|
|
|
&sctx->send_off);
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For each command/item we want to send to userspace, we call this function.
|
|
|
|
*/
|
|
|
|
static int begin_cmd(struct send_ctx *sctx, int cmd)
|
|
|
|
{
|
|
|
|
struct btrfs_cmd_header *hdr;
|
|
|
|
|
2013-10-31 13:00:08 +08:00
|
|
|
if (WARN_ON(!sctx->send_buf))
|
2012-07-26 05:19:24 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
BUG_ON(sctx->send_size);
|
|
|
|
|
|
|
|
sctx->send_size += sizeof(*hdr);
|
|
|
|
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
|
|
|
|
hdr->cmd = cpu_to_le16(cmd);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int send_cmd(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_cmd_header *hdr;
|
|
|
|
u32 crc;
|
|
|
|
|
|
|
|
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
|
|
|
|
hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
|
|
|
|
hdr->crc = 0;
|
|
|
|
|
2014-01-30 05:06:04 +08:00
|
|
|
crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
|
2012-07-26 05:19:24 +08:00
|
|
|
hdr->crc = cpu_to_le32(crc);
|
|
|
|
|
2012-09-14 14:04:21 +08:00
|
|
|
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
|
|
|
|
&sctx->send_off);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
sctx->total_send_size += sctx->send_size;
|
|
|
|
sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
|
|
|
|
sctx->send_size = 0;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sends a move instruction to user space
|
|
|
|
*/
|
|
|
|
static int send_rename(struct send_ctx *sctx,
|
|
|
|
struct fs_path *from, struct fs_path *to)
|
|
|
|
{
|
2016-09-20 22:05:03 +08:00
|
|
|
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
|
2012-07-26 05:19:24 +08:00
|
|
|
int ret;
|
|
|
|
|
2016-09-20 22:05:03 +08:00
|
|
|
btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
|
|
|
|
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
|
|
|
|
tlv_put_failure:
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sends a link instruction to user space
|
|
|
|
*/
|
|
|
|
static int send_link(struct send_ctx *sctx,
|
|
|
|
struct fs_path *path, struct fs_path *lnk)
|
|
|
|
{
|
2016-09-20 22:05:03 +08:00
|
|
|
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
|
2012-07-26 05:19:24 +08:00
|
|
|
int ret;
|
|
|
|
|
2016-09-20 22:05:03 +08:00
|
|
|
btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
|
|
|
|
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
|
|
|
|
tlv_put_failure:
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sends an unlink instruction to user space
|
|
|
|
*/
|
|
|
|
static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
|
|
|
|
{
|
2016-09-20 22:05:03 +08:00
|
|
|
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
|
2012-07-26 05:19:24 +08:00
|
|
|
int ret;
|
|
|
|
|
2016-09-20 22:05:03 +08:00
|
|
|
btrfs_debug(fs_info, "send_unlink %s", path->start);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
|
|
|
|
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
|
|
|
|
tlv_put_failure:
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sends a rmdir instruction to user space
|
|
|
|
*/
|
|
|
|
static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
|
|
|
|
{
|
2016-09-20 22:05:03 +08:00
|
|
|
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
|
2012-07-26 05:19:24 +08:00
|
|
|
int ret;
|
|
|
|
|
2016-09-20 22:05:03 +08:00
|
|
|
btrfs_debug(fs_info, "send_rmdir %s", path->start);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
|
|
|
|
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
|
|
|
|
tlv_put_failure:
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper function to retrieve some fields from an inode item.
|
|
|
|
*/
|
2014-03-29 05:16:01 +08:00
|
|
|
static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
|
|
|
|
u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
|
|
|
|
u64 *gid, u64 *rdev)
|
2012-07-26 05:19:24 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_inode_item *ii;
|
|
|
|
struct btrfs_key key;
|
|
|
|
|
|
|
|
key.objectid = ino;
|
|
|
|
key.type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret) {
|
2014-03-29 05:16:01 +08:00
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
return ret;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_inode_item);
|
|
|
|
if (size)
|
|
|
|
*size = btrfs_inode_size(path->nodes[0], ii);
|
|
|
|
if (gen)
|
|
|
|
*gen = btrfs_inode_generation(path->nodes[0], ii);
|
|
|
|
if (mode)
|
|
|
|
*mode = btrfs_inode_mode(path->nodes[0], ii);
|
|
|
|
if (uid)
|
|
|
|
*uid = btrfs_inode_uid(path->nodes[0], ii);
|
|
|
|
if (gid)
|
|
|
|
*gid = btrfs_inode_gid(path->nodes[0], ii);
|
2012-07-27 05:39:10 +08:00
|
|
|
if (rdev)
|
|
|
|
*rdev = btrfs_inode_rdev(path->nodes[0], ii);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2014-03-29 05:16:01 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Same as __get_inode_info(), but with a path that is allocated and freed
 * internally.
 *
 * Returns -ENOMEM if the path cannot be allocated, otherwise what
 * __get_inode_info() returns.
 */
static int get_inode_info(struct btrfs_root *root,
			  u64 ino, u64 *size, u64 *gen,
			  u64 *mode, u64 *uid, u64 *gid,
			  u64 *rdev)
{
	struct btrfs_path *path = alloc_path_for_send();
	int ret = -ENOMEM;

	if (path) {
		ret = __get_inode_info(root, path, ino, size, gen, mode, uid,
				       gid, rdev);
		btrfs_free_path(path);
	}
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Callback type for iterate_inode_ref(), invoked once per entry.
 *
 * @num:   running number of the entry, starting at 0
 * @dir:   inode number of the directory the entry refers to
 * @index: index value stored in the entry
 * @p:     name of the entry, or its resolved path when resolving was asked for
 * @ctx:   opaque pointer passed through from the iterate_inode_ref() caller
 *
 * A non zero return value stops the iteration.
 */
typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
				   struct fs_path *p,
				   void *ctx);
|
|
|
|
|
|
|
|
/*
 * Helper function to iterate the entries in ONE btrfs_inode_ref or
 * btrfs_inode_extref.
 * The iterate callback may return a non zero value to stop iteration. This can
 * be a negative value for error codes or 1 to simply stop it.
 *
 * path must point to the INODE_REF or INODE_EXTREF when called.
 *
 * When @resolve is set, the callback gets the path built by
 * btrfs_ref_to_path() for each entry, otherwise only the name stored in the
 * entry.
 *
 * Returns 0 after all entries were visited, -ENOMEM if the helper paths
 * cannot be allocated, a negative error from resolving or copying a name, or
 * the non zero value the callback returned (passed on unchanged).
 */
static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
			     struct btrfs_key *found_key, int resolve,
			     iterate_inode_ref_t iterate, void *ctx)
{
	struct extent_buffer *eb = path->nodes[0];
	struct btrfs_item *item;
	struct btrfs_inode_ref *iref;
	struct btrfs_inode_extref *extref;
	struct btrfs_path *tmp_path;
	struct fs_path *p;
	u32 cur = 0;
	u32 total;
	int slot = path->slots[0];
	u32 name_len;
	char *start;
	int ret = 0;
	int num = 0;
	int index;
	u64 dir;
	unsigned long name_off;
	unsigned long elem_size;
	unsigned long ptr;

	/* Reversed, so a resolved path can be taken over via p->start below. */
	p = fs_path_alloc_reversed();
	if (!p)
		return -ENOMEM;

	tmp_path = alloc_path_for_send();
	if (!tmp_path) {
		fs_path_free(p);
		return -ENOMEM;
	}


	/*
	 * ptr is the offset of the item data inside eb, total the size of the
	 * item and elem_size the fixed part of one entry (the name follows).
	 */
	if (found_key->type == BTRFS_INODE_REF_KEY) {
		ptr = (unsigned long)btrfs_item_ptr(eb, slot,
						    struct btrfs_inode_ref);
		item = btrfs_item_nr(slot);
		total = btrfs_item_size(eb, item);
		elem_size = sizeof(*iref);
	} else {
		ptr = btrfs_item_ptr_offset(eb, slot);
		total = btrfs_item_size_nr(eb, slot);
		elem_size = sizeof(*extref);
	}

	/* cur is the offset of the current entry relative to ptr. */
	while (cur < total) {
		fs_path_reset(p);

		if (found_key->type == BTRFS_INODE_REF_KEY) {
			iref = (struct btrfs_inode_ref *)(ptr + cur);
			name_len = btrfs_inode_ref_name_len(eb, iref);
			/* the name directly follows the fixed part */
			name_off = (unsigned long)(iref + 1);
			index = btrfs_inode_ref_index(eb, iref);
			/* for INODE_REF the directory is the key offset */
			dir = found_key->offset;
		} else {
			extref = (struct btrfs_inode_extref *)(ptr + cur);
			name_len = btrfs_inode_extref_name_len(eb, extref);
			name_off = (unsigned long)&extref->name;
			index = btrfs_inode_extref_index(eb, extref);
			dir = btrfs_inode_extref_parent(eb, extref);
		}

		if (resolve) {
			start = btrfs_ref_to_path(root, tmp_path, name_len,
						  name_off, eb, dir,
						  p->buf, p->buf_len);
			if (IS_ERR(start)) {
				ret = PTR_ERR(start);
				goto out;
			}
			if (start < p->buf) {
				/* overflow, try again with a larger buffer */
				ret = fs_path_ensure_buf(p,
						p->buf_len + p->buf - start);
				if (ret < 0)
					goto out;
				start = btrfs_ref_to_path(root, tmp_path,
							  name_len, name_off,
							  eb, dir,
							  p->buf, p->buf_len);
				if (IS_ERR(start)) {
					ret = PTR_ERR(start);
					goto out;
				}
				/* the buffer was sized from the first attempt */
				BUG_ON(start < p->buf);
			}
			p->start = start;
		} else {
			ret = fs_path_add_from_extent_buffer(p, eb, name_off,
							     name_len);
			if (ret < 0)
				goto out;
		}

		/* advance to the next entry before calling back */
		cur += elem_size + name_len;
		ret = iterate(num, dir, index, p, ctx);
		if (ret)
			goto out;
		num++;
	}

out:
	btrfs_free_path(tmp_path);
	fs_path_free(p);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Callback type for iterate_dir_item(), invoked once per entry.
 *
 * @num:      running number of the entry, starting at 0
 * @di_key:   key stored in the dir item, converted to cpu byte order
 * @name:     name of the entry, @name_len bytes
 * @data:     data of the entry, @data_len bytes, located right behind @name
 *            in the same buffer
 * @type:     type field of the dir item
 * @ctx:      opaque pointer passed through from the iterate_dir_item() caller
 *
 * A non zero return value stops the iteration.
 */
typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
				  const char *name, int name_len,
				  const char *data, int data_len,
				  u8 type, void *ctx);
|
|
|
|
|
|
|
|
/*
 * Helper function to iterate the entries in ONE btrfs_dir_item.
 * The iterate callback may return a non zero value to stop iteration. This can
 * be a negative value for error codes or 1 to simply stop it.
 *
 * path must point to the dir item when called.
 *
 * @found_key is not referenced by this function.
 *
 * Returns 0 when all entries were visited or the callback stopped the
 * iteration with a positive value, the negative value returned by the
 * callback, -ENAMETOOLONG or -E2BIG when an entry exceeds the size limits
 * checked below, or -ENOMEM when the copy buffer cannot be allocated.
 */
static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *found_key,
			    iterate_dir_item_t iterate, void *ctx)
{
	int ret = 0;
	struct extent_buffer *eb;
	struct btrfs_item *item;
	struct btrfs_dir_item *di;
	struct btrfs_key di_key;
	char *buf = NULL;
	int buf_len;
	u32 name_len;
	u32 data_len;
	u32 cur;
	u32 len;
	u32 total;
	int slot;
	int num;
	u8 type;

	/*
	 * Start with a small buffer (PATH_MAX bytes). If later we end up
	 * needing more space, which can happen for xattrs on a fs with a leaf
	 * size greater than the page size, attempt to increase the buffer.
	 * Typically xattr values are small.
	 */
	buf_len = PATH_MAX;
	buf = kmalloc(buf_len, GFP_KERNEL);
	if (!buf) {
		ret = -ENOMEM;
		goto out;
	}

	eb = path->nodes[0];
	slot = path->slots[0];
	item = btrfs_item_nr(slot);
	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	cur = 0;
	len = 0;
	total = btrfs_item_size(eb, item);

	num = 0;
	/* cur counts the bytes of the item that were consumed so far. */
	while (cur < total) {
		name_len = btrfs_dir_name_len(eb, di);
		data_len = btrfs_dir_data_len(eb, di);
		type = btrfs_dir_type(eb, di);
		btrfs_dir_item_key_to_cpu(eb, di, &di_key);

		if (type == BTRFS_FT_XATTR) {
			if (name_len > XATTR_NAME_MAX) {
				ret = -ENAMETOOLONG;
				goto out;
			}
			if (name_len + data_len >
					BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
				ret = -E2BIG;
				goto out;
			}
		} else {
			/*
			 * Path too long
			 */
			if (name_len + data_len > PATH_MAX) {
				ret = -ENAMETOOLONG;
				goto out;
			}
		}

		/*
		 * Grow the buffer: try krealloc() first and fall back to
		 * vmalloc(). The old content is not needed, the entry is read
		 * into the buffer from scratch below.
		 */
		if (name_len + data_len > buf_len) {
			buf_len = name_len + data_len;
			if (is_vmalloc_addr(buf)) {
				vfree(buf);
				buf = NULL;
			} else {
				char *tmp = krealloc(buf, buf_len,
						GFP_KERNEL | __GFP_NOWARN);

				/* krealloc() keeps the old buffer on failure */
				if (!tmp)
					kfree(buf);
				buf = tmp;
			}
			if (!buf) {
				buf = vmalloc(buf_len);
				if (!buf) {
					ret = -ENOMEM;
					goto out;
				}
			}
		}

		/* name and data are stored back to back behind the header */
		read_extent_buffer(eb, buf, (unsigned long)(di + 1),
				name_len + data_len);

		/* advance to the next entry before calling back */
		len = sizeof(*di) + name_len + data_len;
		di = (struct btrfs_dir_item *)((char *)di + len);
		cur += len;

		ret = iterate(num, &di_key, buf, name_len, buf + name_len,
				data_len, type, ctx);
		if (ret < 0)
			goto out;
		/* a positive value only stops the iteration, it is no error */
		if (ret) {
			ret = 0;
			goto out;
		}

		num++;
	}

out:
	/* buf may come from kmalloc() or vmalloc(), or be NULL */
	kvfree(buf);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * iterate_inode_ref() callback: copy the path of the entry into the fs_path
 * passed as @ctx.
 *
 * Returns a negative errno if the copy fails, otherwise 1 so that the
 * iteration stops after the first entry.
 */
static int __copy_first_ref(int num, u64 dir, int index,
			    struct fs_path *p, void *ctx)
{
	struct fs_path *dest = ctx;
	int ret = fs_path_copy(dest, p);

	/* we want the first only */
	return ret < 0 ? ret : 1;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Retrieve the first path of an inode. If an inode has more then one
|
|
|
|
* ref/hardlink, this is ignored.
|
|
|
|
*/
|
2013-05-08 15:51:52 +08:00
|
|
|
static int get_inode_path(struct btrfs_root *root,
|
2012-07-26 05:19:24 +08:00
|
|
|
u64 ino, struct fs_path *path)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_key key, found_key;
|
|
|
|
struct btrfs_path *p;
|
|
|
|
|
|
|
|
p = alloc_path_for_send();
|
|
|
|
if (!p)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
fs_path_reset(path);
|
|
|
|
|
|
|
|
key.objectid = ino;
|
|
|
|
key.type = BTRFS_INODE_REF_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
|
|
|
ret = 1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
|
|
|
|
if (found_key.objectid != ino ||
|
2012-10-15 16:30:45 +08:00
|
|
|
(found_key.type != BTRFS_INODE_REF_KEY &&
|
|
|
|
found_key.type != BTRFS_INODE_EXTREF_KEY)) {
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = iterate_inode_ref(root, p, &found_key, 1,
|
|
|
|
__copy_first_ref, path);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(p);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * State handed to __iterate_backrefs() through its void *ctx_ argument.
 */
struct backref_ctx {
	struct send_ctx *sctx;

	/* path used for the inode lookup done for each backref */
	struct btrfs_path *path;
	/* number of total found references */
	u64 found;

	/*
	 * used for clones found in send_root. clones found behind cur_objectid
	 * and cur_offset are not considered as allowed clones.
	 */
	u64 cur_objectid;
	u64 cur_offset;

	/* may be truncated in case it's the last extent in a file */
	u64 extent_len;

	/* data offset in the file extent item */
	u64 data_offset;

	/* Just to check for bugs in backref resolving */
	int found_itself;
};
|
|
|
|
|
|
|
|
static int __clone_root_cmp_bsearch(const void *key, const void *elt)
|
|
|
|
{
|
2012-08-13 16:52:38 +08:00
|
|
|
u64 root = (u64)(uintptr_t)key;
|
2012-07-26 05:19:24 +08:00
|
|
|
struct clone_root *cr = (struct clone_root *)elt;
|
|
|
|
|
|
|
|
if (root < cr->root->objectid)
|
|
|
|
return -1;
|
|
|
|
if (root > cr->root->objectid)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __clone_root_cmp_sort(const void *e1, const void *e2)
|
|
|
|
{
|
|
|
|
struct clone_root *cr1 = (struct clone_root *)e1;
|
|
|
|
struct clone_root *cr2 = (struct clone_root *)e2;
|
|
|
|
|
|
|
|
if (cr1->root->objectid < cr2->root->objectid)
|
|
|
|
return -1;
|
|
|
|
if (cr1->root->objectid > cr2->root->objectid)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Called for every backref that is found for the current extent.
 * Results are collected in sctx->clone_roots->ino/offset/found_refs
 *
 * @ino/@offset/@root identify the backref, @ctx_ is a struct backref_ctx.
 *
 * Returns 0 both when the backref was counted and when it was skipped, or
 * the negative error of the inode lookup.
 */
static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
{
	struct backref_ctx *bctx = ctx_;
	struct clone_root *found;
	int ret;
	u64 i_size;

	/* First check if the root is in the list of accepted clone sources */
	found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
			bctx->sctx->clone_roots_cnt,
			sizeof(struct clone_root),
			__clone_root_cmp_bsearch);
	if (!found)
		return 0;

	/* Note that the backref of the extent currently processed was seen. */
	if (found->root == bctx->sctx->send_root &&
	    ino == bctx->cur_objectid &&
	    offset == bctx->cur_offset) {
		bctx->found_itself = 1;
	}

	/*
	 * There are inodes that have extents that lie behind its i_size. Don't
	 * accept clones from these extents.
	 */
	ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
			       NULL, NULL, NULL);
	/* the path is shared between calls, release it in any case */
	btrfs_release_path(bctx->path);
	if (ret < 0)
		return ret;

	if (offset + bctx->data_offset + bctx->extent_len > i_size)
		return 0;

	/*
	 * Make sure we don't consider clones from send_root that are
	 * behind the current inode/offset.
	 */
	if (found->root == bctx->sctx->send_root) {
		/*
		 * TODO for the moment we don't accept clones from the inode
		 * that is currently send. We may change this when
		 * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
		 * file.
		 */
		if (ino >= bctx->cur_objectid)
			return 0;
#if 0
		if (ino > bctx->cur_objectid)
			return 0;
		if (offset + bctx->extent_len > bctx->cur_offset)
			return 0;
#endif
	}

	bctx->found++;
	found->found_refs++;
	/* Remember the lowest inode number seen for this root. */
	if (ino < found->ino) {
		found->ino = ino;
		found->offset = offset;
	} else if (found->ino == ino) {
		/*
		 * same extent found more than once in the same file.
		 */
		if (found->offset > offset + bctx->extent_len)
			found->offset = offset;
	}

	return 0;
}
|
|
|
|
|
|
|
|
/*
|
2012-07-28 20:11:31 +08:00
|
|
|
* Given an inode, offset and extent item, it finds a good clone for a clone
|
|
|
|
* instruction. Returns -ENOENT when none could be found. The function makes
|
|
|
|
* sure that the returned clone is usable at the point where sending is at the
|
|
|
|
* moment. This means, that no clones are accepted which lie behind the current
|
|
|
|
* inode+offset.
|
|
|
|
*
|
2012-07-26 05:19:24 +08:00
|
|
|
* path must point to the extent item when called.
|
|
|
|
*/
|
|
|
|
static int find_extent_clone(struct send_ctx *sctx,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
u64 ino, u64 data_offset,
|
|
|
|
u64 ino_size,
|
|
|
|
struct clone_root **found)
|
|
|
|
{
|
2016-09-20 22:05:03 +08:00
|
|
|
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
|
2012-07-26 05:19:24 +08:00
|
|
|
int ret;
|
|
|
|
int extent_type;
|
|
|
|
u64 logical;
|
2012-08-08 04:25:13 +08:00
|
|
|
u64 disk_byte;
|
2012-07-26 05:19:24 +08:00
|
|
|
u64 num_bytes;
|
|
|
|
u64 extent_item_pos;
|
2012-09-08 10:01:28 +08:00
|
|
|
u64 flags = 0;
|
2012-07-26 05:19:24 +08:00
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
struct extent_buffer *eb = path->nodes[0];
|
2012-07-28 18:44:34 +08:00
|
|
|
struct backref_ctx *backref_ctx = NULL;
|
2012-07-26 05:19:24 +08:00
|
|
|
struct clone_root *cur_clone_root;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_path *tmp_path;
|
2012-08-08 04:25:13 +08:00
|
|
|
int compressed;
|
2012-07-26 05:19:24 +08:00
|
|
|
u32 i;
|
|
|
|
|
|
|
|
tmp_path = alloc_path_for_send();
|
|
|
|
if (!tmp_path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2014-03-29 05:16:01 +08:00
|
|
|
/* We only use this path under the commit sem */
|
|
|
|
tmp_path->need_commit_sem = 0;
|
|
|
|
|
2016-01-19 01:42:13 +08:00
|
|
|
backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
|
2012-07-28 18:44:34 +08:00
|
|
|
if (!backref_ctx) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2014-03-29 05:16:01 +08:00
|
|
|
backref_ctx->path = tmp_path;
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
if (data_offset >= ino_size) {
|
|
|
|
/*
|
|
|
|
* There may be extents that lie behind the file's size.
|
|
|
|
* I at least had this in combination with snapshotting while
|
|
|
|
* writing large files.
|
|
|
|
*/
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(eb, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
extent_type = btrfs_file_extent_type(eb, fi);
|
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-08-08 04:25:13 +08:00
|
|
|
compressed = btrfs_file_extent_compression(eb, fi);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
num_bytes = btrfs_file_extent_num_bytes(eb, fi);
|
2012-08-08 04:25:13 +08:00
|
|
|
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
|
|
|
|
if (disk_byte == 0) {
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-08-08 04:25:13 +08:00
|
|
|
logical = disk_byte + btrfs_file_extent_offset(eb, fi);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2016-09-20 22:05:03 +08:00
|
|
|
down_read(&fs_info->commit_root_sem);
|
|
|
|
ret = extent_from_logical(fs_info, disk_byte, tmp_path,
|
2012-09-08 10:01:28 +08:00
|
|
|
&found_key, &flags);
|
2016-09-20 22:05:03 +08:00
|
|
|
up_read(&fs_info->commit_root_sem);
|
2012-07-26 05:19:24 +08:00
|
|
|
btrfs_release_path(tmp_path);
|
|
|
|
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-09-08 10:01:28 +08:00
|
|
|
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Setup the clone roots.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < sctx->clone_roots_cnt; i++) {
|
|
|
|
cur_clone_root = sctx->clone_roots + i;
|
|
|
|
cur_clone_root->ino = (u64)-1;
|
|
|
|
cur_clone_root->offset = 0;
|
|
|
|
cur_clone_root->found_refs = 0;
|
|
|
|
}
|
|
|
|
|
2012-07-28 18:44:34 +08:00
|
|
|
backref_ctx->sctx = sctx;
|
|
|
|
backref_ctx->found = 0;
|
|
|
|
backref_ctx->cur_objectid = ino;
|
|
|
|
backref_ctx->cur_offset = data_offset;
|
|
|
|
backref_ctx->found_itself = 0;
|
|
|
|
backref_ctx->extent_len = num_bytes;
|
2015-05-03 08:56:00 +08:00
|
|
|
/*
|
|
|
|
* For non-compressed extents iterate_extent_inodes() gives us extent
|
|
|
|
* offsets that already take into account the data offset, but not for
|
|
|
|
* compressed extents, since the offset is logical and not relative to
|
|
|
|
* the physical extent locations. We must take this into account to
|
|
|
|
* avoid sending clone offsets that go beyond the source file's size,
|
|
|
|
* which would result in the clone ioctl failing with -EINVAL on the
|
|
|
|
* receiving end.
|
|
|
|
*/
|
|
|
|
if (compressed == BTRFS_COMPRESS_NONE)
|
|
|
|
backref_ctx->data_offset = 0;
|
|
|
|
else
|
|
|
|
backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The last extent of a file may be too large due to page alignment.
|
|
|
|
* We need to adjust extent_len in this case so that the checks in
|
|
|
|
* __iterate_backrefs work.
|
|
|
|
*/
|
|
|
|
if (data_offset + num_bytes >= ino_size)
|
2012-07-28 18:44:34 +08:00
|
|
|
backref_ctx->extent_len = ino_size - data_offset;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Now collect all backrefs.
|
|
|
|
*/
|
2012-08-08 04:25:13 +08:00
|
|
|
if (compressed == BTRFS_COMPRESS_NONE)
|
|
|
|
extent_item_pos = logical - found_key.objectid;
|
|
|
|
else
|
|
|
|
extent_item_pos = 0;
|
2016-06-23 06:54:23 +08:00
|
|
|
ret = iterate_extent_inodes(fs_info, found_key.objectid,
|
|
|
|
extent_item_pos, 1, __iterate_backrefs,
|
|
|
|
backref_ctx);
|
2012-08-08 04:25:13 +08:00
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2012-07-28 18:44:34 +08:00
|
|
|
if (!backref_ctx->found_itself) {
|
2012-07-26 05:19:24 +08:00
|
|
|
/* found a bug in backref code? */
|
|
|
|
ret = -EIO;
|
2016-09-20 22:05:03 +08:00
|
|
|
btrfs_err(fs_info,
|
2016-09-20 22:05:00 +08:00
|
|
|
"did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu",
|
2016-09-20 22:05:03 +08:00
|
|
|
ino, data_offset, disk_byte, found_key.objectid);
|
2012-07-26 05:19:24 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2016-09-20 22:05:03 +08:00
|
|
|
btrfs_debug(fs_info,
|
|
|
|
"find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
|
|
|
|
data_offset, ino, num_bytes, logical);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2012-07-28 18:44:34 +08:00
|
|
|
if (!backref_ctx->found)
|
2016-09-20 22:05:03 +08:00
|
|
|
btrfs_debug(fs_info, "no clones found");
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
cur_clone_root = NULL;
|
|
|
|
for (i = 0; i < sctx->clone_roots_cnt; i++) {
|
|
|
|
if (sctx->clone_roots[i].found_refs) {
|
|
|
|
if (!cur_clone_root)
|
|
|
|
cur_clone_root = sctx->clone_roots + i;
|
|
|
|
else if (sctx->clone_roots[i].root == sctx->send_root)
|
|
|
|
/* prefer clones from send_root over others */
|
|
|
|
cur_clone_root = sctx->clone_roots + i;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cur_clone_root) {
|
|
|
|
*found = cur_clone_root;
|
|
|
|
ret = 0;
|
|
|
|
} else {
|
|
|
|
ret = -ENOENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(tmp_path);
|
2012-07-28 18:44:34 +08:00
|
|
|
kfree(backref_ctx);
|
2012-07-26 05:19:24 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
static int read_symlink(struct btrfs_root *root,
|
2012-07-26 05:19:24 +08:00
|
|
|
u64 ino,
|
|
|
|
struct fs_path *dest)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_file_extent_item *ei;
|
|
|
|
u8 type;
|
|
|
|
u8 compression;
|
|
|
|
unsigned long off;
|
|
|
|
int len;
|
|
|
|
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = ino;
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2016-01-01 02:07:59 +08:00
|
|
|
if (ret) {
|
|
|
|
/*
|
|
|
|
* An empty symlink inode. Can happen in rare error paths when
|
|
|
|
* creating a symlink (transaction committed before the inode
|
|
|
|
* eviction handler removed the symlink inode items and a crash
|
|
|
|
* happened in between or the subvol was snapshoted in between).
|
|
|
|
* Print an informative message to dmesg/syslog so that the user
|
|
|
|
* can delete the symlink.
|
|
|
|
*/
|
|
|
|
btrfs_err(root->fs_info,
|
|
|
|
"Found empty symlink inode %llu at root %llu",
|
|
|
|
ino, root->root_key.objectid);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
type = btrfs_file_extent_type(path->nodes[0], ei);
|
|
|
|
compression = btrfs_file_extent_compression(path->nodes[0], ei);
|
|
|
|
BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
|
|
|
|
BUG_ON(compression);
|
|
|
|
|
|
|
|
off = btrfs_file_extent_inline_start(ei);
|
2014-01-04 13:07:00 +08:00
|
|
|
len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper function to generate a file name that is unique in the root of
|
|
|
|
* send_root and parent_root. This is used to generate names for orphan inodes.
|
|
|
|
*/
|
|
|
|
static int gen_unique_name(struct send_ctx *sctx,
|
|
|
|
u64 ino, u64 gen,
|
|
|
|
struct fs_path *dest)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
char tmp[64];
|
|
|
|
int len;
|
|
|
|
u64 idx = 0;
|
|
|
|
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
while (1) {
|
2014-01-22 07:36:38 +08:00
|
|
|
len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
|
2012-07-26 05:19:24 +08:00
|
|
|
ino, gen, idx);
|
2014-02-04 01:24:09 +08:00
|
|
|
ASSERT(len < sizeof(tmp));
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
|
|
|
|
path, BTRFS_FIRST_FREE_OBJECTID,
|
|
|
|
tmp, strlen(tmp), 0);
|
|
|
|
btrfs_release_path(path);
|
|
|
|
if (IS_ERR(di)) {
|
|
|
|
ret = PTR_ERR(di);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (di) {
|
|
|
|
/* not unique, try again */
|
|
|
|
idx++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!sctx->parent_root) {
|
|
|
|
/* unique */
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
|
|
|
|
path, BTRFS_FIRST_FREE_OBJECTID,
|
|
|
|
tmp, strlen(tmp), 0);
|
|
|
|
btrfs_release_path(path);
|
|
|
|
if (IS_ERR(di)) {
|
|
|
|
ret = PTR_ERR(di);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (di) {
|
|
|
|
/* not unique, try again */
|
|
|
|
idx++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* unique */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = fs_path_add(dest, tmp, strlen(tmp));
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Result of get_cur_inode_state(): how an inode, identified by number and
 * generation, relates to the send and parent roots, and whether the send
 * stream has already passed it (inode number below send_progress).
 */
enum inode_state {
	/* the generation matches in both the send root and the parent root */
	inode_state_no_change = 0,
	/* matches only in the send root, inode number >= send_progress */
	inode_state_will_create,
	/* matches only in the send root, inode number < send_progress */
	inode_state_did_create,
	/* matches only in the parent root, inode number >= send_progress */
	inode_state_will_delete,
	/* matches only in the parent root, inode number < send_progress */
	inode_state_did_delete,
};
|
|
|
|
|
|
|
|
/*
 * Classify inode @ino with generation @gen against the send root and, when
 * present, the parent root.
 *
 * An inode "matches" a root when it exists there with generation @gen. The
 * result is inode_state_no_change when it matches both roots, a create state
 * when it matches only the send root and a delete state when it matches only
 * the parent root. The did_ and will_ variants are chosen by whether @ino is
 * below sctx->send_progress.
 *
 * Returns an enum inode_state value, -ENOENT when the inode matches neither
 * root, or another negative errno if reading the inode info failed.
 */
static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
{
	u64 left_gen;
	u64 right_gen;
	int left_ret;
	int right_ret = -ENOENT;	/* stays like this without a parent root */
	int in_send;
	int in_parent;
	int processed;

	left_ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL,
				  NULL, NULL, NULL);
	if (left_ret < 0 && left_ret != -ENOENT)
		return left_ret;

	if (sctx->parent_root) {
		right_ret = get_inode_info(sctx->parent_root, ino, NULL,
					   &right_gen, NULL, NULL, NULL, NULL);
		if (right_ret < 0 && right_ret != -ENOENT)
			return right_ret;
	}

	/* The generations are only read when the respective lookup worked. */
	in_send = !left_ret && left_gen == gen;
	in_parent = !right_ret && right_gen == gen;
	processed = ino < sctx->send_progress;

	if (in_send && in_parent)
		return inode_state_no_change;
	if (in_send)
		return processed ? inode_state_did_create :
				   inode_state_will_create;
	if (in_parent)
		return processed ? inode_state_did_delete :
				   inode_state_will_delete;

	return -ENOENT;
}
|
|
|
|
|
|
|
|
/*
 * Tell whether inode @ino with generation @gen exists at the current point of
 * the send stream.
 *
 * Returns 1 if the inode's state is no_change, did_create or will_delete,
 * 0 for any other state, or the negative value returned by
 * get_cur_inode_state().
 */
static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
{
	int state;

	/*
	 * The root inode of a subvolume/snapshot is always treated as
	 * existing, whatever its generation. The generation may differ between
	 * the parent and send snapshots (inode_cache mount option, or
	 * snapshots received from a different filesystem) and must not hide
	 * name collisions between its children.
	 */
	if (ino == BTRFS_FIRST_FREE_OBJECTID)
		return 1;

	state = get_cur_inode_state(sctx, ino, gen);
	if (state < 0)
		return state;

	switch (state) {
	case inode_state_no_change:
	case inode_state_did_create:
	case inode_state_will_delete:
		return 1;
	default:
		return 0;
	}
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper function to lookup a dir item in a dir.
|
|
|
|
*/
|
|
|
|
static int lookup_dir_item_inode(struct btrfs_root *root,
|
|
|
|
u64 dir, const char *name, int name_len,
|
|
|
|
u64 *found_inode,
|
|
|
|
u8 *found_type)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
di = btrfs_lookup_dir_item(NULL, root, path,
|
|
|
|
dir, name, name_len, 0);
|
|
|
|
if (!di) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (IS_ERR(di)) {
|
|
|
|
ret = PTR_ERR(di);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
|
2014-05-25 11:49:24 +08:00
|
|
|
if (key.type == BTRFS_ROOT_ITEM_KEY) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
*found_inode = key.objectid;
|
|
|
|
*found_type = btrfs_dir_type(path->nodes[0], di);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
|
|
|
|
* generation of the parent dir and the name of the dir entry.
|
|
|
|
*/
|
2013-05-08 15:51:52 +08:00
|
|
|
static int get_first_ref(struct btrfs_root *root, u64 ino,
|
2012-07-26 05:19:24 +08:00
|
|
|
u64 *dir, u64 *dir_gen, struct fs_path *name)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
int len;
|
2012-10-15 16:30:45 +08:00
|
|
|
u64 parent_dir;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = ino;
|
|
|
|
key.type = BTRFS_INODE_REF_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (!ret)
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
|
|
|
|
path->slots[0]);
|
2012-10-15 16:30:45 +08:00
|
|
|
if (ret || found_key.objectid != ino ||
|
|
|
|
(found_key.type != BTRFS_INODE_REF_KEY &&
|
|
|
|
found_key.type != BTRFS_INODE_EXTREF_KEY)) {
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2014-05-14 05:01:02 +08:00
|
|
|
if (found_key.type == BTRFS_INODE_REF_KEY) {
|
2012-10-15 16:30:45 +08:00
|
|
|
struct btrfs_inode_ref *iref;
|
|
|
|
iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_inode_ref);
|
|
|
|
len = btrfs_inode_ref_name_len(path->nodes[0], iref);
|
|
|
|
ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
|
|
|
|
(unsigned long)(iref + 1),
|
|
|
|
len);
|
|
|
|
parent_dir = found_key.offset;
|
|
|
|
} else {
|
|
|
|
struct btrfs_inode_extref *extref;
|
|
|
|
extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_inode_extref);
|
|
|
|
len = btrfs_inode_extref_name_len(path->nodes[0], extref);
|
|
|
|
ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
|
|
|
|
(unsigned long)&extref->name, len);
|
|
|
|
parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
2014-03-21 20:46:54 +08:00
|
|
|
if (dir_gen) {
|
|
|
|
ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
|
|
|
|
NULL, NULL, NULL);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2012-10-15 16:30:45 +08:00
|
|
|
*dir = parent_dir;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
/*
 * Check whether the ref (@dir, @name) is the first ref of inode @ino in @root,
 * i.e. the one get_first_ref() returns.
 *
 * Returns 1 if parent directory, name length and name all match, 0 if not,
 * -ENOMEM if the temporary path name could not be allocated, or the negative
 * value returned by get_first_ref().
 */
static int is_first_ref(struct btrfs_root *root,
			u64 ino, u64 dir,
			const char *name, int name_len)
{
	struct fs_path *first_name;
	u64 first_dir;
	int ret;

	first_name = fs_path_alloc();
	if (!first_name)
		return -ENOMEM;

	ret = get_first_ref(root, ino, &first_dir, NULL, first_name);
	if (ret < 0)
		goto out;

	/* The names are only compared once directory and length agree. */
	ret = dir == first_dir &&
	      name_len == fs_path_len(first_name) &&
	      memcmp(first_name->start, name, name_len) == 0;

out:
	fs_path_free(first_name);
	return ret;
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Used by process_recorded_refs to determine if a new ref would overwrite an
|
|
|
|
* already existing ref. In case it detects an overwrite, it returns the
|
|
|
|
* inode/gen in who_ino/who_gen.
|
|
|
|
* When an overwrite is detected, process_recorded_refs does proper orphanizing
|
|
|
|
* to make sure later references to the overwritten inode are possible.
|
|
|
|
* Orphanizing is however only required for the first ref of an inode.
|
|
|
|
* process_recorded_refs does an additional is_first_ref check to see if
|
|
|
|
* orphanizing is really required.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
|
|
|
|
const char *name, int name_len,
|
|
|
|
u64 *who_ino, u64 *who_gen)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
2013-08-07 04:47:48 +08:00
|
|
|
u64 gen;
|
2012-07-26 05:19:24 +08:00
|
|
|
u64 other_inode = 0;
|
|
|
|
u8 other_type = 0;
|
|
|
|
|
|
|
|
if (!sctx->parent_root)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = is_inode_existent(sctx, dir, dir_gen);
|
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
|
|
|
|
2013-08-07 04:47:48 +08:00
|
|
|
/*
|
|
|
|
* If we have a parent root we need to verify that the parent dir was
|
2016-05-20 09:18:45 +08:00
|
|
|
* not deleted and then re-created, if it was then we have no overwrite
|
2013-08-07 04:47:48 +08:00
|
|
|
* and we can just unlink this entry.
|
|
|
|
*/
|
Btrfs: send, fix failure to rename top level inode due to name collision
Under certain situations, an incremental send operation can fail due to a
premature attempt to create a new top level inode (a direct child of the
subvolume/snapshot root) whose name collides with another inode that was
removed from the send snapshot.
Consider the following example scenario.
Parent snapshot:
. (ino 256, gen 8)
|---- a1/ (ino 257, gen 9)
|---- a2/ (ino 258, gen 9)
Send snapshot:
. (ino 256, gen 3)
|---- a2/ (ino 257, gen 7)
In this scenario, when receiving the incremental send stream, the btrfs
receive command fails like this (ran in verbose mode, -vv argument):
rmdir a1
mkfile o257-7-0
rename o257-7-0 -> a2
ERROR: rename o257-7-0 -> a2 failed: Is a directory
What happens when computing the incremental send stream is:
1) An operation to remove the directory with inode number 257 and
generation 9 is issued.
2) An operation to create the inode with number 257 and generation 7 is
issued. This creates the inode with an orphanized name of "o257-7-0".
3) An operation rename the new inode 257 to its final name, "a2", is
issued. This is incorrect because inode 258, which has the same name
and it's a child of the same parent (root inode 256), was not yet
processed and therefore no rmdir operation for it was yet issued.
The rename operation is issued because we fail to detect that the
name of the new inode 257 collides with inode 258, because their
parent, a subvolume/snapshot root (inode 256) has a different
generation in both snapshots.
So fix this by ignoring the generation value of a parent directory that
matches a root inode (number 256) when we are checking if the name of the
inode currently being processed collides with the name of some other
inode that was not yet processed.
We can achieve this scenario of different inodes with the same number but
different generation values either by mounting a filesystem with the inode
cache option (-o inode_cache) or by creating and sending snapshots across
different filesystems, like in the following example:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/a1
$ mkdir /mnt/a2
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ btrfs send /mnt/snap1 -f /tmp/1.snap
$ umount /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt
$ touch /mnt/a2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs receive /mnt -f /tmp/1.snap
# Take note that once the filesystem is created, its current
# generation has value 7 so the inode from the second snapshot has
# a generation value of 7. And after receiving the first snapshot
# the filesystem is at a generation value of 10, because the call to
# create the second snapshot bumps the generation to 8 (the snapshot
# creation ioctl does a transaction commit), the receive command calls
# the snapshot creation ioctl to create the first snapshot, which bumps
# the filesystem's generation to 9, and finally when the receive
# operation finishes it calls an ioctl to transition the first snapshot
# (snap1) from RW mode to RO mode, which does another transaction commit
# and bumps the filesystem's generation to 10.
$ rm -f /tmp/1.snap
$ btrfs send /mnt/snap1 -f /tmp/1.snap
$ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/2.snap
$ umount /mnt
$ mkfs.btrfs -f /dev/sdd
$ mount /dev/sdd /mnt
$ btrfs receive /mnt /tmp/1.snap
# Receive of snapshot snap2 used to fail.
$ btrfs receive /mnt /tmp/2.snap
Signed-off-by: Robbie Ko <robbieko@synology.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
[Rewrote changelog to be more precise and clear]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-01-05 16:24:55 +08:00
|
|
|
if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
|
2013-08-07 04:47:48 +08:00
|
|
|
ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
|
|
|
|
NULL, NULL, NULL);
|
|
|
|
if (ret < 0 && ret != -ENOENT)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (gen != dir_gen)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
|
|
|
|
&other_inode, &other_type);
|
|
|
|
if (ret < 0 && ret != -ENOENT)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Check if the overwritten ref was already processed. If yes, the ref
|
|
|
|
* was already unlinked/moved, so we can safely assume that we will not
|
|
|
|
* overwrite anything at this point in time.
|
|
|
|
*/
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name has inode 259. So
we end up with inode 258 move/rename operation waiting for inode's 259
move/rename operation, which in turn it waiting for inode's 258
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
if (other_inode > sctx->send_progress ||
|
|
|
|
is_waiting_for_move(sctx, other_inode)) {
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = get_inode_info(sctx->parent_root, other_inode, NULL,
|
2012-07-27 05:39:10 +08:00
|
|
|
who_gen, NULL, NULL, NULL, NULL);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = 1;
|
|
|
|
*who_ino = other_inode;
|
|
|
|
} else {
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Checks if the ref was overwritten by an already processed inode. This is
|
|
|
|
* used by __get_cur_name_and_parent to find out if the ref was orphanized and
|
|
|
|
* thus the orphan name needs to be used.
|
|
|
|
* process_recorded_refs also uses it to avoid unlinking of refs that were
|
|
|
|
* overwritten.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
static int did_overwrite_ref(struct send_ctx *sctx,
|
|
|
|
u64 dir, u64 dir_gen,
|
|
|
|
u64 ino, u64 ino_gen,
|
|
|
|
const char *name, int name_len)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
u64 gen;
|
|
|
|
u64 ow_inode;
|
|
|
|
u8 other_type;
|
|
|
|
|
|
|
|
if (!sctx->parent_root)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = is_inode_existent(sctx, dir, dir_gen);
|
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* check if the ref was overwritten by another ref */
|
|
|
|
ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
|
|
|
|
&ow_inode, &other_type);
|
|
|
|
if (ret < 0 && ret != -ENOENT)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
|
|
|
/* was never and will never be overwritten */
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
|
2012-07-27 05:39:10 +08:00
|
|
|
NULL, NULL);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (ow_inode == ino && gen == ino_gen) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.
For example, for the following reproducer where this is needed (provided
by Robbie Ko):
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt2
$ mkdir -p /mnt/data/n1/n2
$ mkdir /mnt/data/n4
$ mkdir -p /mnt/data/t6/t7
$ mkdir /mnt/data/t5
$ mkdir /mnt/data/t7
$ mkdir /mnt/data/n4/t2
$ mkdir /mnt/data/t4
$ mkdir /mnt/data/t3
$ mv /mnt/data/t7 /mnt/data/n4/t2
$ mv /mnt/data/t4 /mnt/data/n4/t2/t7
$ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
$ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
$ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
$ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
$ mv /mnt/data/n4/t2 /mnt/data/n4/n1
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
$ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
$ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 | btrfs receive /mnt2
$ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
ERROR: send ioctl failed with -12: Cannot allocate memory
Where the parent snapshot directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- t2/ (ino 265)
|-- t7/ (ino 264)
|-- t4/ (ino 266)
|-- t5/ (ino 263)
|-- t6/ (ino 261)
|-- n1/ (ino 258)
|-- n2/ (ino 259)
|-- t7/ (ino 262)
|-- t3/ (ino 267)
And the send snapshot's directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- n1/ (ino 258)
|-- t2/ (ino 265)
|-- n2/ (ino 259)
|-- t3/ (ino 267)
| |-- t7 (ino 264)
|
|-- t6/ (ino 261)
| |-- t4/ (ino 266)
| |-- t5/ (ino 263)
|
|-- t7/ (ino 262)
While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
have been done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:
start inode 264, send progress of 265 for example
parent of 264 -> 267
parent of 267 -> 262
parent of 262 -> 259
parent of 259 -> 261
parent of 261 -> 263
parent of 263 -> 266
parent of 266 -> 264
|--> back to first iteration while current path string length
is <= PATH_MAX, and fail with -ENOMEM otherwise
So fix this by making the check if we need to delay a directory rename
regardless of the current inode having been orphanized or not.
A test case for fstests follows soon.
Thanks to Robbie Ko for providing a reproducer for this problem.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-04-09 21:09:14 +08:00
|
|
|
/*
|
|
|
|
* We know that it is or will be overwritten. Check this now.
|
|
|
|
* The current inode being processed might have been the one that caused
|
2015-09-26 22:30:23 +08:00
|
|
|
* inode 'ino' to be orphanized, therefore check if ow_inode matches
|
|
|
|
* the current inode being processed.
|
Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.
For example, for the following reproducer where this is needed (provided
by Robbie Ko):
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt2
$ mkdir -p /mnt/data/n1/n2
$ mkdir /mnt/data/n4
$ mkdir -p /mnt/data/t6/t7
$ mkdir /mnt/data/t5
$ mkdir /mnt/data/t7
$ mkdir /mnt/data/n4/t2
$ mkdir /mnt/data/t4
$ mkdir /mnt/data/t3
$ mv /mnt/data/t7 /mnt/data/n4/t2
$ mv /mnt/data/t4 /mnt/data/n4/t2/t7
$ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
$ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
$ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
$ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
$ mv /mnt/data/n4/t2 /mnt/data/n4/n1
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
$ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
$ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 | btrfs receive /mnt2
$ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
ERROR: send ioctl failed with -12: Cannot allocate memory
Where the parent snapshot directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- t2/ (ino 265)
|-- t7/ (ino 264)
|-- t4/ (ino 266)
|-- t5/ (ino 263)
|-- t6/ (ino 261)
|-- n1/ (ino 258)
|-- n2/ (ino 259)
|-- t7/ (ino 262)
|-- t3/ (ino 267)
And the send snapshot's directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- n1/ (ino 258)
|-- t2/ (ino 265)
|-- n2/ (ino 259)
|-- t3/ (ino 267)
| |-- t7 (ino 264)
|
|-- t6/ (ino 261)
| |-- t4/ (ino 266)
| |-- t5/ (ino 263)
|
|-- t7/ (ino 262)
While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
have been done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:
start inode 264, send progress of 265 for example
parent of 264 -> 267
parent of 267 -> 262
parent of 262 -> 259
parent of 259 -> 261
parent of 261 -> 263
parent of 263 -> 266
parent of 266 -> 264
|--> back to first iteration while current path string length
is <= PATH_MAX, and fail with -ENOMEM otherwise
So fix this by making the check if we need to delay a directory rename
regardless of the current inode having been orphanized or not.
A test case for fstests follows soon.
Thanks to Robbie Ko for providing a reproducer for this problem.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-04-09 21:09:14 +08:00
|
|
|
*/
|
2015-09-26 22:30:23 +08:00
|
|
|
if ((ow_inode < sctx->send_progress) ||
|
|
|
|
(ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
|
|
|
|
gen == sctx->cur_inode_gen))
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = 1;
|
|
|
|
else
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Same as did_overwrite_ref, but also checks if it is the first ref of an inode
|
|
|
|
* that got overwritten. This is used by process_recorded_refs to determine
|
|
|
|
* if it has to use the path as returned by get_cur_path or the orphan name.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct fs_path *name = NULL;
|
|
|
|
u64 dir;
|
|
|
|
u64 dir_gen;
|
|
|
|
|
|
|
|
if (!sctx->parent_root)
|
|
|
|
goto out;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
name = fs_path_alloc();
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!name)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
|
|
|
|
name->start, fs_path_len(name));
|
|
|
|
|
|
|
|
out:
|
2013-05-08 15:51:52 +08:00
|
|
|
fs_path_free(name);
|
2012-07-26 05:19:24 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
|
|
|
|
* so we need to do some special handling in case we have clashes. This function
|
|
|
|
* takes care of this with the help of name_cache_entry::radix_list.
|
2012-08-01 18:07:43 +08:00
|
|
|
* In case of error, nce is kfreed.
|
2012-07-28 20:11:31 +08:00
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
static int name_cache_insert(struct send_ctx *sctx,
|
|
|
|
struct name_cache_entry *nce)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
2012-07-28 20:20:58 +08:00
|
|
|
struct list_head *nce_head;
|
|
|
|
|
|
|
|
nce_head = radix_tree_lookup(&sctx->name_cache,
|
|
|
|
(unsigned long)nce->ino);
|
|
|
|
if (!nce_head) {
|
2016-01-19 01:42:13 +08:00
|
|
|
nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
|
2012-12-17 14:38:51 +08:00
|
|
|
if (!nce_head) {
|
|
|
|
kfree(nce);
|
2012-07-26 05:19:24 +08:00
|
|
|
return -ENOMEM;
|
2012-12-17 14:38:51 +08:00
|
|
|
}
|
2012-07-28 20:20:58 +08:00
|
|
|
INIT_LIST_HEAD(nce_head);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2012-07-28 20:20:58 +08:00
|
|
|
ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
|
2012-08-01 18:07:43 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
kfree(nce_head);
|
|
|
|
kfree(nce);
|
2012-07-26 05:19:24 +08:00
|
|
|
return ret;
|
2012-08-01 18:07:43 +08:00
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
2012-07-28 20:20:58 +08:00
|
|
|
list_add_tail(&nce->radix_list, nce_head);
|
2012-07-26 05:19:24 +08:00
|
|
|
list_add_tail(&nce->list, &sctx->name_cache_list);
|
|
|
|
sctx->name_cache_size++;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void name_cache_delete(struct send_ctx *sctx,
|
|
|
|
struct name_cache_entry *nce)
|
|
|
|
{
|
2012-07-28 20:20:58 +08:00
|
|
|
struct list_head *nce_head;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2012-07-28 20:20:58 +08:00
|
|
|
nce_head = radix_tree_lookup(&sctx->name_cache,
|
|
|
|
(unsigned long)nce->ino);
|
2014-02-04 02:24:40 +08:00
|
|
|
if (!nce_head) {
|
|
|
|
btrfs_err(sctx->send_root->fs_info,
|
|
|
|
"name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
|
|
|
|
nce->ino, sctx->name_cache_size);
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2012-07-28 20:20:58 +08:00
|
|
|
list_del(&nce->radix_list);
|
2012-07-26 05:19:24 +08:00
|
|
|
list_del(&nce->list);
|
|
|
|
sctx->name_cache_size--;
|
2012-07-28 20:20:58 +08:00
|
|
|
|
2014-02-04 02:24:40 +08:00
|
|
|
/*
|
|
|
|
* We may not get to the final release of nce_head if the lookup fails
|
|
|
|
*/
|
|
|
|
if (nce_head && list_empty(nce_head)) {
|
2012-07-28 20:20:58 +08:00
|
|
|
radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
|
|
|
|
kfree(nce_head);
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Look up the name cache entry for ino/gen.
 *
 * The radix tree slot is found through the inode number truncated to
 * unsigned long, so the entries chained in the slot are compared against the
 * full ino and gen. Returns the matching entry or NULL if there is none.
 */
static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
						    u64 ino, u64 gen)
{
	struct name_cache_entry *match = NULL;
	struct name_cache_entry *pos;
	struct list_head *bucket;

	bucket = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
	if (bucket) {
		list_for_each_entry(pos, bucket, radix_list) {
			if (pos->ino != ino || pos->gen != gen)
				continue;
			match = pos;
			break;
		}
	}

	return match;
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Removes the entry from the list and adds it back to the end. This marks the
|
|
|
|
* entry as recently used so that name_cache_clean_unused does not remove it.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
|
|
|
|
{
|
|
|
|
list_del(&nce->list);
|
|
|
|
list_add_tail(&nce->list, &sctx->name_cache_list);
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Remove some entries from the beginning of name_cache_list.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
static void name_cache_clean_unused(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
struct name_cache_entry *nce;
|
|
|
|
|
|
|
|
if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
|
|
|
|
return;
|
|
|
|
|
|
|
|
while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
|
|
|
|
nce = list_entry(sctx->name_cache_list.next,
|
|
|
|
struct name_cache_entry, list);
|
|
|
|
name_cache_delete(sctx, nce);
|
|
|
|
kfree(nce);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void name_cache_free(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
struct name_cache_entry *nce;
|
|
|
|
|
2012-07-28 22:33:49 +08:00
|
|
|
while (!list_empty(&sctx->name_cache_list)) {
|
|
|
|
nce = list_entry(sctx->name_cache_list.next,
|
|
|
|
struct name_cache_entry, list);
|
2012-07-26 05:19:24 +08:00
|
|
|
name_cache_delete(sctx, nce);
|
2012-07-28 20:13:35 +08:00
|
|
|
kfree(nce);
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Used by get_cur_path for each ref up to the root.
|
|
|
|
* Returns 0 if it succeeded.
|
|
|
|
* Returns 1 if the inode is not existent or got overwritten. In that case, the
|
|
|
|
* name is an orphan name. This instructs get_cur_path to stop iterating. If 1
|
|
|
|
* is returned, parent_ino/parent_gen are not guaranteed to be valid.
|
|
|
|
* Returns <0 in case of error.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
static int __get_cur_name_and_parent(struct send_ctx *sctx,
|
|
|
|
u64 ino, u64 gen,
|
|
|
|
u64 *parent_ino,
|
|
|
|
u64 *parent_gen,
|
|
|
|
struct fs_path *dest)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
int nce_ret;
|
|
|
|
struct name_cache_entry *nce = NULL;
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* First check if we already did a call to this function with the same
|
|
|
|
* ino/gen. If yes, check if the cache entry is still up-to-date. If yes
|
|
|
|
* return the cached result.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
nce = name_cache_search(sctx, ino, gen);
|
|
|
|
if (nce) {
|
|
|
|
if (ino < sctx->send_progress && nce->need_later_update) {
|
|
|
|
name_cache_delete(sctx, nce);
|
|
|
|
kfree(nce);
|
|
|
|
nce = NULL;
|
|
|
|
} else {
|
|
|
|
name_cache_used(sctx, nce);
|
|
|
|
*parent_ino = nce->parent_ino;
|
|
|
|
*parent_gen = nce->parent_gen;
|
|
|
|
ret = fs_path_add(dest, nce->name, nce->name_len);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = nce->ret;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* If the inode is not existent yet, add the orphan name and return 1.
|
|
|
|
* This should only happen for the parent dir that we determine in
|
|
|
|
* __record_new_ref
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = is_inode_existent(sctx, ino, gen);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (!ret) {
|
|
|
|
ret = gen_unique_name(sctx, ino, gen, dest);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = 1;
|
|
|
|
goto out_cache;
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Depending on whether the inode was already processed or not, use
|
|
|
|
* send_root or parent_root for ref lookup.
|
|
|
|
*/
|
2014-02-21 08:01:32 +08:00
|
|
|
if (ino < sctx->send_progress)
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = get_first_ref(sctx->send_root, ino,
|
|
|
|
parent_ino, parent_gen, dest);
|
2012-07-26 05:19:24 +08:00
|
|
|
else
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = get_first_ref(sctx->parent_root, ino,
|
|
|
|
parent_ino, parent_gen, dest);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Check if the ref was overwritten by an inode's ref that was processed
|
|
|
|
* earlier. If yes, treat as orphan and return 1.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
|
|
|
|
dest->start, dest->end - dest->start);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
|
|
|
fs_path_reset(dest);
|
|
|
|
ret = gen_unique_name(sctx, ino, gen, dest);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
out_cache:
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Store the result of the lookup in the name cache.
|
|
|
|
*/
|
2016-01-19 01:42:13 +08:00
|
|
|
nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!nce) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
nce->ino = ino;
|
|
|
|
nce->gen = gen;
|
|
|
|
nce->parent_ino = *parent_ino;
|
|
|
|
nce->parent_gen = *parent_gen;
|
|
|
|
nce->name_len = fs_path_len(dest);
|
|
|
|
nce->ret = ret;
|
|
|
|
strcpy(nce->name, dest->start);
|
|
|
|
|
|
|
|
if (ino < sctx->send_progress)
|
|
|
|
nce->need_later_update = 0;
|
|
|
|
else
|
|
|
|
nce->need_later_update = 1;
|
|
|
|
|
|
|
|
nce_ret = name_cache_insert(sctx, nce);
|
|
|
|
if (nce_ret < 0)
|
|
|
|
ret = nce_ret;
|
|
|
|
name_cache_clean_unused(sctx);
|
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Magic happens here. This function returns the first ref to an inode as it
|
|
|
|
* would look like while receiving the stream at this point in time.
|
|
|
|
* We walk the path up to the root. For every inode in between, we check if it
|
|
|
|
* was already processed/sent. If yes, we continue with the parent as found
|
|
|
|
* in send_root. If not, we continue with the parent as found in parent_root.
|
|
|
|
* If we encounter an inode that was deleted at this point in time, we use the
|
|
|
|
* inodes "orphan" name instead of the real name and stop. Same with new inodes
|
|
|
|
* that were not created yet and overwritten inodes/refs.
|
|
|
|
*
|
|
|
|
* When do we have have orphan inodes:
|
|
|
|
* 1. When an inode is freshly created and thus no valid refs are available yet
|
|
|
|
* 2. When a directory lost all it's refs (deleted) but still has dir items
|
|
|
|
* inside which were not processed yet (pending for move/delete). If anyone
|
|
|
|
* tried to get the path to the dir items, it would get a path inside that
|
|
|
|
* orphan directory.
|
|
|
|
* 3. When an inode is moved around or gets new links, it may overwrite the ref
|
|
|
|
* of an unprocessed inode. If in that case the first ref would be
|
|
|
|
* overwritten, the overwritten inode gets "orphanized". Later when we
|
|
|
|
* process this overwritten inode, it is restored at a new place by moving
|
|
|
|
* the orphan inode.
|
|
|
|
*
|
|
|
|
* sctx->send_progress tells this function at which point in time receiving
|
|
|
|
* would be.
|
|
|
|
*/
|
|
|
|
static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
|
|
|
|
struct fs_path *dest)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct fs_path *name = NULL;
|
|
|
|
u64 parent_inode = 0;
|
|
|
|
u64 parent_gen = 0;
|
|
|
|
int stop = 0;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
name = fs_path_alloc();
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!name) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
dest->reversed = 1;
|
|
|
|
fs_path_reset(dest);
|
|
|
|
|
|
|
|
while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
|
Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.
For example, for the following reproducer where this is needed (provided
by Robbie Ko):
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt2
$ mkdir -p /mnt/data/n1/n2
$ mkdir /mnt/data/n4
$ mkdir -p /mnt/data/t6/t7
$ mkdir /mnt/data/t5
$ mkdir /mnt/data/t7
$ mkdir /mnt/data/n4/t2
$ mkdir /mnt/data/t4
$ mkdir /mnt/data/t3
$ mv /mnt/data/t7 /mnt/data/n4/t2
$ mv /mnt/data/t4 /mnt/data/n4/t2/t7
$ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
$ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
$ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
$ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
$ mv /mnt/data/n4/t2 /mnt/data/n4/n1
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
$ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
$ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 | btrfs receive /mnt2
$ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
ERROR: send ioctl failed with -12: Cannot allocate memory
Where the parent snapshot directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- t2/ (ino 265)
|-- t7/ (ino 264)
|-- t4/ (ino 266)
|-- t5/ (ino 263)
|-- t6/ (ino 261)
|-- n1/ (ino 258)
|-- n2/ (ino 259)
|-- t7/ (ino 262)
|-- t3/ (ino 267)
And the send snapshot's directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- n1/ (ino 258)
|-- t2/ (ino 265)
|-- n2/ (ino 259)
|-- t3/ (ino 267)
| |-- t7 (ino 264)
|
|-- t6/ (ino 261)
| |-- t4/ (ino 266)
| |-- t5/ (ino 263)
|
|-- t7/ (ino 262)
While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
have been done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:
start inode 264, send progress of 265 for example
parent of 264 -> 267
parent of 267 -> 262
parent of 262 -> 259
parent of 259 -> 261
parent of 261 -> 263
parent of 263 -> 266
parent of 266 -> 264
|--> back to first iteration while current path string length
is <= PATH_MAX, and fail with -ENOMEM otherwise
So fix this by making the check if we need to delay a directory rename
regardless of the current inode having been orphanized or not.
A test case for fstests follows soon.
Thanks to Robbie Ko for providing a reproducer for this problem.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-04-09 21:09:14 +08:00
|
|
|
struct waiting_dir_move *wdm;
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
fs_path_reset(name);
|
|
|
|
|
2014-02-19 22:31:44 +08:00
|
|
|
if (is_waiting_for_rm(sctx, ino)) {
|
|
|
|
ret = gen_unique_name(sctx, ino, gen, name);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = fs_path_add_path(dest, name);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.
For example, for the following reproducer where this is needed (provided
by Robbie Ko):
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt2
$ mkdir -p /mnt/data/n1/n2
$ mkdir /mnt/data/n4
$ mkdir -p /mnt/data/t6/t7
$ mkdir /mnt/data/t5
$ mkdir /mnt/data/t7
$ mkdir /mnt/data/n4/t2
$ mkdir /mnt/data/t4
$ mkdir /mnt/data/t3
$ mv /mnt/data/t7 /mnt/data/n4/t2
$ mv /mnt/data/t4 /mnt/data/n4/t2/t7
$ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
$ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
$ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
$ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
$ mv /mnt/data/n4/t2 /mnt/data/n4/n1
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
$ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
$ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 | btrfs receive /mnt2
$ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
ERROR: send ioctl failed with -12: Cannot allocate memory
Where the parent snapshot directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- t2/ (ino 265)
|-- t7/ (ino 264)
|-- t4/ (ino 266)
|-- t5/ (ino 263)
|-- t6/ (ino 261)
|-- n1/ (ino 258)
|-- n2/ (ino 259)
|-- t7/ (ino 262)
|-- t3/ (ino 267)
And the send snapshot's directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- n1/ (ino 258)
|-- t2/ (ino 265)
|-- n2/ (ino 259)
|-- t3/ (ino 267)
| |-- t7 (ino 264)
|
|-- t6/ (ino 261)
| |-- t4/ (ino 266)
| |-- t5/ (ino 263)
|
|-- t7/ (ino 262)
While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
have been done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:
start inode 264, send progress of 265 for example
parent of 264 -> 267
parent of 267 -> 262
parent of 262 -> 259
parent of 259 -> 261
parent of 261 -> 263
parent of 263 -> 266
parent of 266 -> 264
|--> back to first iteration while current path string length
is <= PATH_MAX, and fail with -ENOMEM otherwise
So fix this by making the check if we need to delay a directory rename
regardless of the current inode having been orphanized or not.
A test case for fstests follows soon.
Thanks to Robbie Ko for providing a reproducer for this problem.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-04-09 21:09:14 +08:00
|
|
|
wdm = get_waiting_dir_move(sctx, ino);
|
|
|
|
if (wdm && wdm->orphanized) {
|
|
|
|
ret = gen_unique_name(sctx, ino, gen, name);
|
|
|
|
stop = 1;
|
|
|
|
} else if (wdm) {
|
2014-02-21 08:01:32 +08:00
|
|
|
ret = get_first_ref(sctx->parent_root, ino,
|
|
|
|
&parent_inode, &parent_gen, name);
|
|
|
|
} else {
|
|
|
|
ret = __get_cur_name_and_parent(sctx, ino, gen,
|
|
|
|
&parent_inode,
|
|
|
|
&parent_gen, name);
|
|
|
|
if (ret)
|
|
|
|
stop = 1;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = fs_path_add_path(dest, name);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ino = parent_inode;
|
|
|
|
gen = parent_gen;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2013-05-08 15:51:52 +08:00
|
|
|
fs_path_free(name);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!ret)
|
|
|
|
fs_path_unreverse(dest);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
|
|
|
|
*/
|
|
|
|
static int send_subvol_begin(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_root *send_root = sctx->send_root;
|
|
|
|
struct btrfs_root *parent_root = sctx->parent_root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_root_ref *ref;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
char *name = NULL;
|
|
|
|
int namelen;
|
|
|
|
|
2014-01-15 00:26:43 +08:00
|
|
|
path = btrfs_alloc_path();
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2016-01-19 01:42:13 +08:00
|
|
|
name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!name) {
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = send_root->objectid;
|
|
|
|
key.type = BTRFS_ROOT_BACKREF_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
|
|
|
|
&key, path, 1, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
|
|
|
if (key.type != BTRFS_ROOT_BACKREF_KEY ||
|
|
|
|
key.objectid != send_root->objectid) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
|
|
|
|
namelen = btrfs_root_ref_name_len(leaf, ref);
|
|
|
|
read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
if (parent_root) {
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
|
2015-10-01 03:23:33 +08:00
|
|
|
|
|
|
|
if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
|
|
|
|
TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
|
|
|
|
sctx->send_root->root_item.received_uuid);
|
|
|
|
else
|
|
|
|
TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
|
|
|
|
sctx->send_root->root_item.uuid);
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
|
2013-12-03 23:55:48 +08:00
|
|
|
le64_to_cpu(sctx->send_root->root_item.ctransid));
|
2012-07-26 05:19:24 +08:00
|
|
|
if (parent_root) {
|
2015-06-05 05:17:25 +08:00
|
|
|
if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
|
|
|
|
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
|
|
|
|
parent_root->root_item.received_uuid);
|
|
|
|
else
|
|
|
|
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
|
|
|
|
parent_root->root_item.uuid);
|
2012-07-26 05:19:24 +08:00
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
|
2013-12-03 23:55:48 +08:00
|
|
|
le64_to_cpu(sctx->parent_root->root_item.ctransid));
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
|
|
|
|
tlv_put_failure:
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
kfree(name);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Emits a BTRFS_SEND_C_TRUNCATE command carrying the current path of inode
 * @ino (generation @gen) and the new size @size.
 *
 * Returns 0 on success or a negative errno on failure.
 */
static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
{
	struct fs_path *cur_path;
	int ret;

	btrfs_debug(sctx->send_root->fs_info,
		    "send_truncate %llu size=%llu", ino, size);

	cur_path = fs_path_alloc();
	if (cur_path == NULL)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
	if (ret < 0)
		goto out;

	/* Resolve the path the inode has at this point of the stream. */
	ret = get_cur_path(sctx, ino, gen, cur_path);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_path);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(cur_path);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Emits a BTRFS_SEND_C_CHMOD command for inode @ino (generation @gen).
 * Only the bits of @mode covered by the 07777 mask are sent.
 *
 * Returns 0 on success or a negative errno on failure.
 */
static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
{
	struct fs_path *cur_path;
	int ret;

	btrfs_debug(sctx->send_root->fs_info,
		    "send_chmod %llu mode=%llu", ino, mode);

	cur_path = fs_path_alloc();
	if (cur_path == NULL)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, cur_path);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_path);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(cur_path);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Emits a BTRFS_SEND_C_CHOWN command for inode @ino (generation @gen) with
 * the given @uid and @gid.
 *
 * Returns 0 on success or a negative errno on failure.
 */
static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
{
	struct fs_path *cur_path;
	int ret;

	btrfs_debug(sctx->send_root->fs_info,
		    "send_chown %llu uid=%llu, gid=%llu", ino, uid, gid);

	cur_path = fs_path_alloc();
	if (cur_path == NULL)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, cur_path);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_path);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(cur_path);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Emits a BTRFS_SEND_C_UTIMES command for inode @ino (generation @gen).
 * The atime, mtime and ctime values are read from the inode item found in
 * the send root.
 *
 * Returns 0 on success, -ENOENT if the inode item is not found, or another
 * negative errno on failure.
 */
static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
{
	struct btrfs_inode_item *inode_item;
	struct btrfs_path *search_path = NULL;
	struct extent_buffer *leaf;
	struct fs_path *cur_path;
	struct btrfs_key key;
	int ret;

	btrfs_debug(sctx->send_root->fs_info, "send_utimes %llu", ino);

	cur_path = fs_path_alloc();
	if (cur_path == NULL)
		return -ENOMEM;

	search_path = alloc_path_for_send();
	if (search_path == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, sctx->send_root, &key, search_path, 0, 0);
	if (ret) {
		/* A positive result means no exact match for the key. */
		if (ret > 0)
			ret = -ENOENT;
		goto out;
	}

	leaf = search_path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, search_path->slots[0],
				    struct btrfs_inode_item);

	ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, cur_path);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_path);
	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, leaf,
			       &inode_item->atime);
	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, leaf,
			       &inode_item->mtime);
	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, leaf,
			       &inode_item->ctime);
	/* TODO Add otime support when the otime patches get into upstream */

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(cur_path);
	btrfs_free_path(search_path);
	return ret;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
|
|
|
|
* a valid path yet because we did not process the refs yet. So, the inode
|
|
|
|
* is created as orphan.
|
|
|
|
*/
|
2012-07-28 16:42:24 +08:00
|
|
|
static int send_create_inode(struct send_ctx *sctx, u64 ino)
|
2012-07-26 05:19:24 +08:00
|
|
|
{
|
2016-09-20 22:05:03 +08:00
|
|
|
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
|
2012-07-26 05:19:24 +08:00
|
|
|
int ret = 0;
|
|
|
|
struct fs_path *p;
|
|
|
|
int cmd;
|
2012-07-28 16:42:24 +08:00
|
|
|
u64 gen;
|
2012-07-26 05:19:24 +08:00
|
|
|
u64 mode;
|
2012-07-28 16:42:24 +08:00
|
|
|
u64 rdev;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2016-09-20 22:05:03 +08:00
|
|
|
btrfs_debug(fs_info, "send_create_inode %llu", ino);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
p = fs_path_alloc();
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!p)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2014-02-27 17:29:01 +08:00
|
|
|
if (ino != sctx->cur_ino) {
|
|
|
|
ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
|
|
|
|
NULL, NULL, &rdev);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
gen = sctx->cur_inode_gen;
|
|
|
|
mode = sctx->cur_inode_mode;
|
|
|
|
rdev = sctx->cur_inode_rdev;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2012-07-28 22:33:49 +08:00
|
|
|
if (S_ISREG(mode)) {
|
2012-07-26 05:19:24 +08:00
|
|
|
cmd = BTRFS_SEND_C_MKFILE;
|
2012-07-28 22:33:49 +08:00
|
|
|
} else if (S_ISDIR(mode)) {
|
2012-07-26 05:19:24 +08:00
|
|
|
cmd = BTRFS_SEND_C_MKDIR;
|
2012-07-28 22:33:49 +08:00
|
|
|
} else if (S_ISLNK(mode)) {
|
2012-07-26 05:19:24 +08:00
|
|
|
cmd = BTRFS_SEND_C_SYMLINK;
|
2012-07-28 22:33:49 +08:00
|
|
|
} else if (S_ISCHR(mode) || S_ISBLK(mode)) {
|
2012-07-26 05:19:24 +08:00
|
|
|
cmd = BTRFS_SEND_C_MKNOD;
|
2012-07-28 22:33:49 +08:00
|
|
|
} else if (S_ISFIFO(mode)) {
|
2012-07-26 05:19:24 +08:00
|
|
|
cmd = BTRFS_SEND_C_MKFIFO;
|
2012-07-28 22:33:49 +08:00
|
|
|
} else if (S_ISSOCK(mode)) {
|
2012-07-26 05:19:24 +08:00
|
|
|
cmd = BTRFS_SEND_C_MKSOCK;
|
2012-07-28 22:33:49 +08:00
|
|
|
} else {
|
2015-10-08 17:37:06 +08:00
|
|
|
btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
|
2012-07-26 05:19:24 +08:00
|
|
|
(int)(mode & S_IFMT));
|
|
|
|
ret = -ENOTSUPP;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = begin_cmd(sctx, cmd);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2012-07-28 16:42:24 +08:00
|
|
|
ret = gen_unique_name(sctx, ino, gen, p);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
|
2012-07-28 16:42:24 +08:00
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
if (S_ISLNK(mode)) {
|
|
|
|
fs_path_reset(p);
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = read_symlink(sctx->send_root, ino, p);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
|
|
|
|
} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
|
|
|
|
S_ISFIFO(mode) || S_ISSOCK(mode)) {
|
2012-10-16 02:28:46 +08:00
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
|
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
|
|
tlv_put_failure:
|
|
|
|
out:
|
2013-05-08 15:51:52 +08:00
|
|
|
fs_path_free(p);
|
2012-07-26 05:19:24 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-28 16:42:24 +08:00
|
|
|
/*
|
|
|
|
* We need some special handling for inodes that get processed before the parent
|
|
|
|
* directory got created. See process_recorded_refs for details.
|
|
|
|
* This function does the check if we already created the dir out of order.
|
|
|
|
*/
|
|
|
|
static int did_create_dir(struct send_ctx *sctx, u64 dir)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_key di_key;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
int slot;
|
|
|
|
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = dir;
|
|
|
|
key.type = BTRFS_DIR_INDEX_KEY;
|
|
|
|
key.offset = 0;
|
2014-02-06 00:48:56 +08:00
|
|
|
ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2012-07-28 16:42:24 +08:00
|
|
|
while (1) {
|
2014-02-06 00:48:56 +08:00
|
|
|
eb = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
if (slot >= btrfs_header_nritems(eb)) {
|
|
|
|
ret = btrfs_next_leaf(sctx->send_root, path);
|
|
|
|
if (ret < 0) {
|
|
|
|
goto out;
|
|
|
|
} else if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
2012-07-28 16:42:24 +08:00
|
|
|
}
|
2014-02-06 00:48:56 +08:00
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(eb, &found_key, slot);
|
|
|
|
if (found_key.objectid != key.objectid ||
|
2012-07-28 16:42:24 +08:00
|
|
|
found_key.type != key.type) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
|
|
|
|
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
|
|
|
|
|
2013-08-12 22:56:14 +08:00
|
|
|
if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
|
|
|
|
di_key.objectid < sctx->send_progress) {
|
2012-07-28 16:42:24 +08:00
|
|
|
ret = 1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2014-02-06 00:48:56 +08:00
|
|
|
path->slots[0]++;
|
2012-07-28 16:42:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only creates the inode if it is:
|
|
|
|
* 1. Not a directory
|
|
|
|
* 2. Or a directory which was not created already due to out of order
|
|
|
|
* directories. See did_create_dir and process_recorded_refs for details.
|
|
|
|
*/
|
|
|
|
static int send_create_inode_if_needed(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (S_ISDIR(sctx->cur_inode_mode)) {
|
|
|
|
ret = did_create_dir(sctx, sctx->cur_ino);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = send_create_inode(sctx, sctx->cur_ino);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
struct recorded_ref {
|
|
|
|
struct list_head list;
|
|
|
|
char *dir_path;
|
|
|
|
char *name;
|
|
|
|
struct fs_path *full_path;
|
|
|
|
u64 dir;
|
|
|
|
u64 dir_gen;
|
|
|
|
int dir_path_len;
|
|
|
|
int name_len;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to process new refs before deleted refs, but compare_tree gives us
|
|
|
|
* everything mixed. So we first record all refs and later process them.
|
|
|
|
* This function is a helper to record one ref.
|
|
|
|
*/
|
2014-03-03 21:31:03 +08:00
|
|
|
static int __record_ref(struct list_head *head, u64 dir,
|
2012-07-26 05:19:24 +08:00
|
|
|
u64 dir_gen, struct fs_path *path)
|
|
|
|
{
|
|
|
|
struct recorded_ref *ref;
|
|
|
|
|
2016-01-19 01:42:13 +08:00
|
|
|
ref = kmalloc(sizeof(*ref), GFP_KERNEL);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!ref)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ref->dir = dir;
|
|
|
|
ref->dir_gen = dir_gen;
|
|
|
|
ref->full_path = path;
|
|
|
|
|
2013-08-21 15:32:13 +08:00
|
|
|
ref->name = (char *)kbasename(ref->full_path->start);
|
|
|
|
ref->name_len = ref->full_path->end - ref->name;
|
|
|
|
ref->dir_path = ref->full_path->start;
|
|
|
|
if (ref->name == ref->full_path->start)
|
2012-07-26 05:19:24 +08:00
|
|
|
ref->dir_path_len = 0;
|
2013-08-21 15:32:13 +08:00
|
|
|
else
|
2012-07-26 05:19:24 +08:00
|
|
|
ref->dir_path_len = ref->full_path->end -
|
|
|
|
ref->full_path->start - 1 - ref->name_len;
|
|
|
|
|
|
|
|
list_add_tail(&ref->list, head);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-08-17 04:52:55 +08:00
|
|
|
static int dup_ref(struct recorded_ref *ref, struct list_head *list)
|
|
|
|
{
|
|
|
|
struct recorded_ref *new;
|
|
|
|
|
2016-01-19 01:42:13 +08:00
|
|
|
new = kmalloc(sizeof(*ref), GFP_KERNEL);
|
2013-08-17 04:52:55 +08:00
|
|
|
if (!new)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
new->dir = ref->dir;
|
|
|
|
new->dir_gen = ref->dir_gen;
|
|
|
|
new->full_path = NULL;
|
|
|
|
INIT_LIST_HEAD(&new->list);
|
|
|
|
list_add_tail(&new->list, list);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
static void __free_recorded_refs(struct list_head *head)
|
2012-07-26 05:19:24 +08:00
|
|
|
{
|
|
|
|
struct recorded_ref *cur;
|
|
|
|
|
2012-07-28 22:33:49 +08:00
|
|
|
while (!list_empty(head)) {
|
|
|
|
cur = list_entry(head->next, struct recorded_ref, list);
|
2013-05-08 15:51:52 +08:00
|
|
|
fs_path_free(cur->full_path);
|
2012-07-28 22:33:49 +08:00
|
|
|
list_del(&cur->list);
|
2012-07-26 05:19:24 +08:00
|
|
|
kfree(cur);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_recorded_refs(struct send_ctx *sctx)
|
|
|
|
{
|
2013-05-08 15:51:52 +08:00
|
|
|
__free_recorded_refs(&sctx->new_refs);
|
|
|
|
__free_recorded_refs(&sctx->deleted_refs);
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2012-07-28 20:11:31 +08:00
|
|
|
* Renames/moves a file/dir to its orphan name. Used when the first
|
2012-07-26 05:19:24 +08:00
|
|
|
* ref of an unprocessed inode gets overwritten and for all non empty
|
|
|
|
* directories.
|
|
|
|
*/
|
|
|
|
static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
|
|
|
|
struct fs_path *path)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct fs_path *orphan;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
orphan = fs_path_alloc();
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!orphan)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ret = gen_unique_name(sctx, ino, gen, orphan);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = send_rename(sctx, path, orphan);
|
|
|
|
|
|
|
|
out:
|
2013-05-08 15:51:52 +08:00
|
|
|
fs_path_free(orphan);
|
2012-07-26 05:19:24 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-02-19 22:31:44 +08:00
|
|
|
static struct orphan_dir_info *
|
|
|
|
add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
|
|
|
|
{
|
|
|
|
struct rb_node **p = &sctx->orphan_dirs.rb_node;
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
struct orphan_dir_info *entry, *odi;
|
|
|
|
|
2016-01-19 01:42:13 +08:00
|
|
|
odi = kmalloc(sizeof(*odi), GFP_KERNEL);
|
2014-02-19 22:31:44 +08:00
|
|
|
if (!odi)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
odi->ino = dir_ino;
|
|
|
|
odi->gen = 0;
|
|
|
|
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
entry = rb_entry(parent, struct orphan_dir_info, node);
|
|
|
|
if (dir_ino < entry->ino) {
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
} else if (dir_ino > entry->ino) {
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
} else {
|
|
|
|
kfree(odi);
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
rb_link_node(&odi->node, parent, p);
|
|
|
|
rb_insert_color(&odi->node, &sctx->orphan_dirs);
|
|
|
|
return odi;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct orphan_dir_info *
|
|
|
|
get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
|
|
|
|
{
|
|
|
|
struct rb_node *n = sctx->orphan_dirs.rb_node;
|
|
|
|
struct orphan_dir_info *entry;
|
|
|
|
|
|
|
|
while (n) {
|
|
|
|
entry = rb_entry(n, struct orphan_dir_info, node);
|
|
|
|
if (dir_ino < entry->ino)
|
|
|
|
n = n->rb_left;
|
|
|
|
else if (dir_ino > entry->ino)
|
|
|
|
n = n->rb_right;
|
|
|
|
else
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Returns 1 if sctx->orphan_dirs holds an entry for @dir_ino, 0 otherwise.
 */
static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
{
	return get_orphan_dir_info(sctx, dir_ino) != NULL;
}
|
|
|
|
|
|
|
|
static void free_orphan_dir_info(struct send_ctx *sctx,
|
|
|
|
struct orphan_dir_info *odi)
|
|
|
|
{
|
|
|
|
if (!odi)
|
|
|
|
return;
|
|
|
|
rb_erase(&odi->node, &sctx->orphan_dirs);
|
|
|
|
kfree(odi);
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
/*
|
|
|
|
* Returns 1 if a directory can be removed at this point in time.
|
|
|
|
* We check this by iterating all dir items and checking if the inode behind
|
|
|
|
* the dir item was already processed.
|
|
|
|
*/
|
2014-02-19 22:31:44 +08:00
|
|
|
static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
|
|
|
|
u64 send_progress)
|
2012-07-26 05:19:24 +08:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_root *root = sctx->parent_root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_key loc;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
|
2012-08-01 20:48:59 +08:00
|
|
|
/*
|
|
|
|
* Don't try to rmdir the top/root subvolume dir.
|
|
|
|
*/
|
|
|
|
if (dir == BTRFS_FIRST_FREE_OBJECTID)
|
|
|
|
return 0;
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = dir;
|
|
|
|
key.type = BTRFS_DIR_INDEX_KEY;
|
|
|
|
key.offset = 0;
|
2014-02-06 00:48:56 +08:00
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
while (1) {
|
2014-02-19 22:31:44 +08:00
|
|
|
struct waiting_dir_move *dm;
|
|
|
|
|
2014-02-06 00:48:56 +08:00
|
|
|
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
else if (ret > 0)
|
|
|
|
break;
|
|
|
|
continue;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
2014-02-06 00:48:56 +08:00
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
|
|
|
|
path->slots[0]);
|
|
|
|
if (found_key.objectid != key.objectid ||
|
|
|
|
found_key.type != key.type)
|
2012-07-26 05:19:24 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
di = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_dir_item);
|
|
|
|
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
|
|
|
|
|
2014-02-19 22:31:44 +08:00
|
|
|
dm = get_waiting_dir_move(sctx, loc.objectid);
|
|
|
|
if (dm) {
|
|
|
|
struct orphan_dir_info *odi;
|
|
|
|
|
|
|
|
odi = add_orphan_dir_info(sctx, dir);
|
|
|
|
if (IS_ERR(odi)) {
|
|
|
|
ret = PTR_ERR(odi);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
odi->gen = dir_gen;
|
|
|
|
dm->rmdir_ino = dir;
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
if (loc.objectid > send_progress) {
|
Btrfs: send, fix warning due to late freeing of orphan_dir_info structures
Under certain situations, when doing an incremental send, we can end up
not freeing orphan_dir_info structures as soon as they are no longer
needed. Instead we end up freeing them only after finishing the send
stream, which causes a warning to be emitted:
[282735.229200] ------------[ cut here ]------------
[282735.229968] WARNING: CPU: 9 PID: 10588 at fs/btrfs/send.c:6298 btrfs_ioctl_send+0xe2f/0xe51 [btrfs]
[282735.231282] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[282735.237130] CPU: 9 PID: 10588 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[282735.239309] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[282735.240160] 0000000000000000 ffff880224273ca8 ffffffff8126b42c 0000000000000000
[282735.240160] 0000000000000000 ffff880224273ce8 ffffffff81052b14 0000189a24273ac8
[282735.240160] ffff8802210c9800 0000000000000000 0000000000000001 0000000000000000
[282735.240160] Call Trace:
[282735.240160] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[282735.240160] [<ffffffff81052b14>] __warn+0xc2/0xdd
[282735.240160] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[282735.240160] [<ffffffffa03c99d5>] btrfs_ioctl_send+0xe2f/0xe51 [btrfs]
[282735.240160] [<ffffffffa0398358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[282735.240160] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[282735.240160] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[282735.240160] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[282735.240160] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[282735.240160] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[282735.240160] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[282735.240160] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[282735.240160] [<ffffffff81100c6b>] ? time_hardirqs_off+0x9/0x14
[282735.240160] [<ffffffff8108e87d>] ? trace_hardirqs_off_caller+0x1f/0xaa
[282735.256343] ---[ end trace a4539270c8056f93 ]---
Consider the following example:
Parent snapshot:
. (ino 256)
|--- a/ (ino 257)
| |--- c/ (ino 260)
|
|--- del/ (ino 259)
|--- tmp/ (ino 258)
|--- x/ (ino 261)
|--- y/ (ino 262)
Send snapshot:
. (ino 256)
|--- a/ (ino 257)
| |--- x/ (ino 261)
| |--- y/ (ino 262)
|
|--- c/ (ino 260)
|--- tmp/ (ino 258)
1) When processing inode 258, we end up delaying its rename operation
because it has an ancestor (in the send snapshot) that has a higher
inode number (inode 260) which was also renamed in the send snapshot,
therefore we delay the rename of inode 258 so that it happens after
inode 260 is renamed;
2) When processing inode 259, we end up delaying its deletion (rmdir
operation) because it has a child inode (258) that has its rename
operation delayed. At this point we allocate an orphan_dir_info
structure and tag inode 258 so that we later attempt to see if we
can delete (rmdir) inode 259 once inode 258 is renamed;
3) When we process inode 260, after renaming it we finally do the rename
operation for inode 258. Once we issue the rename operation for inode
258 we notice that this inode was tagged so that we attempt to see
if at this point we can delete (rmdir) inode 259. But at this point
we can not still delete inode 259 because it has 2 children, inodes
261 and 262, that were not yet processed and therefore not yet
moved (renamed) away from inode 259. We end up not freeing the
orphan_dir_info structure allocated in step 2;
4) We process inodes 261 and 262, and once we move/rename inode 262
we issue the rmdir operation for inode 260;
5) We finish the send stream and notice that red black tree that
contains orphan_dir_info structures is not empty, so we emit
a warning and then free any orphan_dir_structures left.
So fix this by freeing an orphan_dir_info structure once we try to
apply a pending rename operation if we can not delete yet the tagged
directory.
A test case for fstests follows soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Modified changelog to be more detailed and easier to understand]
2015-06-22 17:08:45 +08:00
|
|
|
struct orphan_dir_info *odi;
|
|
|
|
|
|
|
|
odi = get_orphan_dir_info(sctx, dir);
|
|
|
|
free_orphan_dir_info(sctx, odi);
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2014-02-06 00:48:56 +08:00
|
|
|
path->slots[0]++;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ret = 1;
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher
number was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
/*
 * Tell whether @ino currently has an entry in the waiting dir moves tree.
 *
 * Returns 1 if get_waiting_dir_move() finds an entry for @ino in @sctx,
 * 0 otherwise.
 */
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
{
	return get_waiting_dir_move(sctx, ino) != NULL;
}
|
|
|
|
|
Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.
For example, for the following reproducer where this is needed (provided
by Robbie Ko):
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt2
$ mkdir -p /mnt/data/n1/n2
$ mkdir /mnt/data/n4
$ mkdir -p /mnt/data/t6/t7
$ mkdir /mnt/data/t5
$ mkdir /mnt/data/t7
$ mkdir /mnt/data/n4/t2
$ mkdir /mnt/data/t4
$ mkdir /mnt/data/t3
$ mv /mnt/data/t7 /mnt/data/n4/t2
$ mv /mnt/data/t4 /mnt/data/n4/t2/t7
$ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
$ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
$ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
$ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
$ mv /mnt/data/n4/t2 /mnt/data/n4/n1
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
$ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
$ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 | btrfs receive /mnt2
$ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
ERROR: send ioctl failed with -12: Cannot allocate memory
Where the parent snapshot directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- t2/ (ino 265)
|-- t7/ (ino 264)
|-- t4/ (ino 266)
|-- t5/ (ino 263)
|-- t6/ (ino 261)
|-- n1/ (ino 258)
|-- n2/ (ino 259)
|-- t7/ (ino 262)
|-- t3/ (ino 267)
And the send snapshot's directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- n1/ (ino 258)
|-- t2/ (ino 265)
|-- n2/ (ino 259)
|-- t3/ (ino 267)
| |-- t7 (ino 264)
|
|-- t6/ (ino 261)
| |-- t4/ (ino 266)
| |-- t5/ (ino 263)
|
|-- t7/ (ino 262)
While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
have been done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:
start inode 264, send progress of 265 for example
parent of 264 -> 267
parent of 267 -> 262
parent of 262 -> 259
parent of 259 -> 261
parent of 261 -> 263
parent of 263 -> 266
parent of 266 -> 264
|--> back to first iteration while current path string length
is <= PATH_MAX, and fail with -ENOMEM otherwise
So fix this by always checking whether a directory rename needs to be delayed,
regardless of whether the current inode has been orphanized or not.
A test case for fstests follows soon.
Thanks to Robbie Ko for providing a reproducer for this problem.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-04-09 21:09:14 +08:00
|
|
|
/*
 * Insert a new waiting_dir_move entry for @ino into the
 * sctx->waiting_dir_moves rb-tree, which is ordered by inode number.
 *
 * The new entry starts with rmdir_ino set to 0 and its orphanized flag
 * set to @orphanized.
 *
 * Returns 0 on success, -ENOMEM if the entry could not be allocated, or
 * -EEXIST if the tree already holds an entry for @ino (the freshly
 * allocated entry is freed in that case).
 */
static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
{
	struct rb_node **link = &sctx->waiting_dir_moves.rb_node;
	struct rb_node *parent = NULL;
	struct waiting_dir_move *new_dm;

	/* Allocate up front so the tree walk below cannot fail midway. */
	new_dm = kmalloc(sizeof(*new_dm), GFP_KERNEL);
	if (new_dm == NULL)
		return -ENOMEM;

	new_dm->ino = ino;
	new_dm->rmdir_ino = 0;
	new_dm->orphanized = orphanized;

	/* Descend to the leaf slot where the new node has to be linked. */
	while (*link != NULL) {
		struct waiting_dir_move *cur;

		parent = *link;
		cur = rb_entry(parent, struct waiting_dir_move, node);

		if (ino == cur->ino) {
			/* Duplicate key: discard the unused allocation. */
			kfree(new_dm);
			return -EEXIST;
		}

		link = (ino < cur->ino) ? &parent->rb_left : &parent->rb_right;
	}

	rb_link_node(&new_dm->node, parent, link);
	rb_insert_color(&new_dm->node, &sctx->waiting_dir_moves);
	return 0;
}
|
|
|
|
|
2014-02-19 22:31:44 +08:00
|
|
|
static struct waiting_dir_move *
|
|
|
|
get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
{
|
|
|
|
struct rb_node *n = sctx->waiting_dir_moves.rb_node;
|
|
|
|
struct waiting_dir_move *entry;
|
|
|
|
|
|
|
|
while (n) {
|
|
|
|
entry = rb_entry(n, struct waiting_dir_move, node);
|
2014-02-19 22:31:44 +08:00
|
|
|
if (ino < entry->ino)
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
n = n->rb_left;
|
2014-02-19 22:31:44 +08:00
|
|
|
else if (ino > entry->ino)
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
n = n->rb_right;
|
2014-02-19 22:31:44 +08:00
|
|
|
else
|
|
|
|
return entry;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
}
|
2014-02-19 22:31:44 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_waiting_dir_move(struct send_ctx *sctx,
|
|
|
|
struct waiting_dir_move *dm)
|
|
|
|
{
|
|
|
|
if (!dm)
|
|
|
|
return;
|
|
|
|
rb_erase(&dm->node, &sctx->waiting_dir_moves);
|
|
|
|
kfree(dm);
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
}
|
|
|
|
|
2014-03-19 22:20:54 +08:00
|
|
|
static int add_pending_dir_move(struct send_ctx *sctx,
|
|
|
|
u64 ino,
|
|
|
|
u64 ino_gen,
|
2014-03-28 04:14:01 +08:00
|
|
|
u64 parent_ino,
|
|
|
|
struct list_head *new_refs,
|
2015-03-01 06:29:22 +08:00
|
|
|
struct list_head *deleted_refs,
|
|
|
|
const bool is_orphan)
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory and eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower-numbered inode before the higher-numbered
inode was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
{
|
|
|
|
struct rb_node **p = &sctx->pending_dir_moves.rb_node;
|
|
|
|
struct rb_node *parent = NULL;
|
2014-03-22 06:30:44 +08:00
|
|
|
struct pending_dir_move *entry = NULL, *pm;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory and eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower-numbered inode before the higher-numbered
inode was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
struct recorded_ref *cur;
|
|
|
|
int exists = 0;
|
|
|
|
int ret;
|
|
|
|
|
2016-01-19 01:42:13 +08:00
|
|
|
pm = kmalloc(sizeof(*pm), GFP_KERNEL);
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
if (!pm)
|
|
|
|
return -ENOMEM;
|
|
|
|
pm->parent_ino = parent_ino;
|
2014-03-19 22:20:54 +08:00
|
|
|
pm->ino = ino;
|
|
|
|
pm->gen = ino_gen;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
INIT_LIST_HEAD(&pm->list);
|
|
|
|
INIT_LIST_HEAD(&pm->update_refs);
|
|
|
|
RB_CLEAR_NODE(&pm->node);
|
|
|
|
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
entry = rb_entry(parent, struct pending_dir_move, node);
|
|
|
|
if (parent_ino < entry->parent_ino) {
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
} else if (parent_ino > entry->parent_ino) {
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
} else {
|
|
|
|
exists = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-03-28 04:14:01 +08:00
|
|
|
list_for_each_entry(cur, deleted_refs, list) {
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
ret = dup_ref(cur, &pm->update_refs);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
2014-03-28 04:14:01 +08:00
|
|
|
list_for_each_entry(cur, new_refs, list) {
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
ret = dup_ref(cur, &pm->update_refs);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.
For example, for the following reproducer where this is needed (provided
by Robbie Ko):
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt2
$ mkdir -p /mnt/data/n1/n2
$ mkdir /mnt/data/n4
$ mkdir -p /mnt/data/t6/t7
$ mkdir /mnt/data/t5
$ mkdir /mnt/data/t7
$ mkdir /mnt/data/n4/t2
$ mkdir /mnt/data/t4
$ mkdir /mnt/data/t3
$ mv /mnt/data/t7 /mnt/data/n4/t2
$ mv /mnt/data/t4 /mnt/data/n4/t2/t7
$ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
$ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
$ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
$ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
$ mv /mnt/data/n4/t2 /mnt/data/n4/n1
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
$ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
$ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 | btrfs receive /mnt2
$ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
ERROR: send ioctl failed with -12: Cannot allocate memory
Where the parent snapshot directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- t2/ (ino 265)
|-- t7/ (ino 264)
|-- t4/ (ino 266)
|-- t5/ (ino 263)
|-- t6/ (ino 261)
|-- n1/ (ino 258)
|-- n2/ (ino 259)
|-- t7/ (ino 262)
|-- t3/ (ino 267)
And the send snapshot's directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- n1/ (ino 258)
|-- t2/ (ino 265)
|-- n2/ (ino 259)
|-- t3/ (ino 267)
| |-- t7 (ino 264)
|
|-- t6/ (ino 261)
| |-- t4/ (ino 266)
| |-- t5/ (ino 263)
|
|-- t7/ (ino 262)
While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
have been done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:
start inode 264, send progress of 265 for example
parent of 264 -> 267
parent of 267 -> 262
parent of 262 -> 259
parent of 259 -> 261
parent of 261 -> 263
parent of 263 -> 266
parent of 266 -> 264
|--> back to first iteration while current path string length
is <= PATH_MAX, and fail with -ENOMEM otherwise
So fix this by checking whether we need to delay a directory rename
regardless of whether the current inode has been orphanized or not.
A test case for fstests follows soon.
Thanks to Robbie Ko for providing a reproducer for this problem.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-04-09 21:09:14 +08:00
|
|
|
ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (exists) {
|
|
|
|
list_add_tail(&pm->list, &entry->list);
|
|
|
|
} else {
|
|
|
|
rb_link_node(&pm->node, parent, p);
|
|
|
|
rb_insert_color(&pm->node, &sctx->pending_dir_moves);
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
if (ret) {
|
|
|
|
__free_recorded_refs(&pm->update_refs);
|
|
|
|
kfree(pm);
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
|
|
|
|
u64 parent_ino)
|
|
|
|
{
|
|
|
|
struct rb_node *n = sctx->pending_dir_moves.rb_node;
|
|
|
|
struct pending_dir_move *entry;
|
|
|
|
|
|
|
|
while (n) {
|
|
|
|
entry = rb_entry(n, struct pending_dir_move, node);
|
|
|
|
if (parent_ino < entry->parent_ino)
|
|
|
|
n = n->rb_left;
|
|
|
|
else if (parent_ino > entry->parent_ino)
|
|
|
|
n = n->rb_right;
|
|
|
|
else
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name as inode 259. So
we end up with inode 258's move/rename operation waiting for inode 259's
move/rename operation, which in turn is waiting for inode 258's
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
/*
 * Walk from @ino towards the root of the tree, one parent at a time, and
 * report whether the walk comes back to the inode it started from.
 *
 * @sctx:         send context, passed through to the lookup helpers.
 * @name:         scratch path; it is reset at the start of every step and
 *                receives the name looked up for the inode being visited.
 * @ino:          inode number to start from.
 * @gen:          generation of @ino.
 * @ancestor_ino: output, cleared to 0 on entry. It is set to the first
 *                visited inode that is waiting for a move, or, if none was
 *                seen, to the inode whose parent is the starting inode.
 *
 * Returns 1 if some visited inode has the starting inode as its parent,
 * a negative value if one of the lookups fails, and otherwise the last
 * lookup result (0 when no lookup was made). A positive result from
 * __get_cur_name_and_parent() stops the walk with 0.
 * NOTE(review): that positive result presumably means the inode currently
 * has no usable name/parent (e.g. it was orphanized) - confirm against the
 * helper.
 */
static int path_loop(struct send_ctx *sctx, struct fs_path *name,
		     u64 ino, u64 gen, u64 *ancestor_ino)
{
	const u64 first_ino = ino;
	u64 up_ino = 0;
	u64 up_gen = 0;
	int ret = 0;

	*ancestor_ino = 0;

	for (; ino != BTRFS_FIRST_FREE_OBJECTID; ino = up_ino, gen = up_gen) {
		fs_path_reset(name);

		/* An inode pending removal ends the walk with the current result. */
		if (is_waiting_for_rm(sctx, ino))
			return ret;

		if (!is_waiting_for_move(sctx, ino)) {
			ret = __get_cur_name_and_parent(sctx, ino, gen,
							&up_ino, &up_gen,
							name);
			if (ret > 0)
				return 0;
		} else {
			/* Remember only the first inode found waiting for a move. */
			if (*ancestor_ino == 0)
				*ancestor_ino = ino;
			/* Its parent is looked up in the parent root. */
			ret = get_first_ref(sctx->parent_root, ino,
					    &up_ino, &up_gen, name);
		}

		if (ret < 0)
			return ret;

		/* The parent is where we started: the path loops. */
		if (up_ino == first_ino) {
			if (*ancestor_ino == 0)
				*ancestor_ino = ino;
			return 1;
		}
	}

	return ret;
}
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher
number was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
|
|
|
|
{
|
|
|
|
struct fs_path *from_path = NULL;
|
|
|
|
struct fs_path *to_path = NULL;
|
2014-02-16 21:43:11 +08:00
|
|
|
struct fs_path *name = NULL;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher
number was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
u64 orig_progress = sctx->send_progress;
|
|
|
|
struct recorded_ref *cur;
|
2014-02-16 21:43:11 +08:00
|
|
|
u64 parent_ino, parent_gen;
|
2014-02-19 22:31:44 +08:00
|
|
|
struct waiting_dir_move *dm = NULL;
|
|
|
|
u64 rmdir_ino = 0;
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name as inode 259. So
we end up with inode 258's move/rename operation waiting for inode 259's
move/rename operation, which in turn is waiting for inode 258's
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
u64 ancestor;
|
|
|
|
bool is_orphan;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher
number was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
int ret;
|
|
|
|
|
2014-02-16 21:43:11 +08:00
|
|
|
name = fs_path_alloc();
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
    |-- b (ino 258)
        |-- c (ino 259)
        |   |-- d (ino 260)
        |
        |-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
    |-- b (ino 258)
        |-- c2 (ino 261)
            |-- d2 (ino 260)
                |-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
from_path = fs_path_alloc();
|
2014-02-16 21:43:11 +08:00
|
|
|
if (!name || !from_path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
    |-- b (ino 258)
        |-- c (ino 259)
        |   |-- d (ino 260)
        |
        |-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
    |-- b (ino 258)
        |-- c2 (ino 261)
            |-- d2 (ino 260)
                |-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
|
2014-02-19 22:31:44 +08:00
|
|
|
dm = get_waiting_dir_move(sctx, pm->ino);
|
|
|
|
ASSERT(dm);
|
|
|
|
rmdir_ino = dm->rmdir_ino;
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name as inode 259. So
we end up with inode 258's move/rename operation waiting for inode 259's
move/rename operation, which in turn is waiting for inode 258's
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
is_orphan = dm->orphanized;
|
2014-02-19 22:31:44 +08:00
|
|
|
free_waiting_dir_move(sctx, dm);
|
2014-02-16 21:43:11 +08:00
|
|
|
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name as inode 259. So
we end up with inode 258's move/rename operation waiting for inode 259's
move/rename operation, which in turn is waiting for inode 258's
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
if (is_orphan) {
|
2015-03-01 06:29:22 +08:00
|
|
|
ret = gen_unique_name(sctx, pm->ino,
|
|
|
|
pm->gen, from_path);
|
|
|
|
} else {
|
|
|
|
ret = get_first_ref(sctx->parent_root, pm->ino,
|
|
|
|
&parent_ino, &parent_gen, name);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = get_cur_path(sctx, parent_ino, parent_gen,
|
|
|
|
from_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = fs_path_add_path(from_path, name);
|
|
|
|
}
|
2014-03-23 01:15:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2014-02-16 21:43:11 +08:00
|
|
|
|
2014-03-28 04:14:01 +08:00
|
|
|
sctx->send_progress = sctx->cur_ino + 1;
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name as inode 259. So
we end up with inode 258's move/rename operation waiting for inode 259's
move/rename operation, which in turn is waiting for inode 258's
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
|
2016-06-18 00:13:36 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name as inode 259. So
we end up with inode 258's move/rename operation waiting for inode 259's
move/rename operation, which in turn is waiting for inode 258's
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
if (ret) {
|
|
|
|
LIST_HEAD(deleted_refs);
|
|
|
|
ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
|
|
|
|
ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
|
|
|
|
&pm->update_refs, &deleted_refs,
|
|
|
|
is_orphan);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (rmdir_ino) {
|
|
|
|
dm = get_waiting_dir_move(sctx, pm->ino);
|
|
|
|
ASSERT(dm);
|
|
|
|
dm->rmdir_ino = rmdir_ino;
|
|
|
|
}
|
|
|
|
goto out;
|
|
|
|
}
|
2014-03-23 01:15:24 +08:00
|
|
|
fs_path_reset(name);
|
|
|
|
to_path = name;
|
2014-02-16 21:43:11 +08:00
|
|
|
name = NULL;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = send_rename(sctx, from_path, to_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2014-02-19 22:31:44 +08:00
|
|
|
if (rmdir_ino) {
|
|
|
|
struct orphan_dir_info *odi;
|
|
|
|
|
|
|
|
odi = get_orphan_dir_info(sctx, rmdir_ino);
|
|
|
|
if (!odi) {
|
|
|
|
/* already deleted */
|
|
|
|
goto finish;
|
|
|
|
}
|
Btrfs: incremental send, fix premature rmdir operations
Under certain situations, an incremental send operation can contain
a rmdir operation that will make the receiving end fail when attempting
to execute it, because the target directory is not yet empty.
Consider the following example:
Parent snapshot:
. (ino 256)
|--- a/ (ino 257)
| |--- c/ (ino 260)
|
|--- del/ (ino 259)
|--- tmp/ (ino 258)
|--- x/ (ino 261)
Send snapshot:
. (ino 256)
|--- a/ (ino 257)
| |--- x/ (ino 261)
|
|--- c/ (ino 260)
|--- tmp/ (ino 258)
1) When processing inode 258, we delay its rename operation because inode
260 is its new parent in the send snapshot and it was not yet renamed
(since 260 > 258, that is, beyond the current progress);
2) When processing inode 259, we realize we can not yet send an rmdir
operation (against inode 259) because inode 258 was still not yet
renamed/moved away from inode 259. Therefore we update data structures
so that after inode 258 is renamed, we try again to see if we can
finally send an rmdir operation for inode 259;
3) When we process inode 260, we send a rename operation for it followed
by a rename operation for inode 258. Once we send the rename operation
for inode 258 we then check if we can finally issue an rmdir for its
previous parent, inode 259, by calling the can_rmdir() function with
a value of sctx->cur_ino + 1 (260 + 1 = 261) for its "progress"
argument. This makes can_rmdir() return true (value 1) because even
though there's still a child inode of inode 259 that was not yet
renamed/moved, which is inode 261, the given value of progress (261)
is not lower than 261 (that is, not lower than the inode number of
some child of inode 259). So we end up sending a rmdir operation for
inode 259 before its child inode 261 is processed and renamed.
So fix this by passing the correct progress value to the call to
can_rmdir() from within apply_dir_move() (where we issue delayed rename
operations), which should match sctx->cur_ino (the number of the inode
currently being processed) and not sctx->cur_ino + 1.
A test case for fstests follows soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed, clear and well formatted]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:49 +08:00
|
|
|
ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino);
|
2014-02-19 22:31:44 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (!ret)
|
|
|
|
goto finish;
|
|
|
|
|
|
|
|
name = fs_path_alloc();
|
|
|
|
if (!name) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = send_rmdir(sctx, name);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
free_orphan_dir_info(sctx, odi);
|
|
|
|
}
|
|
|
|
|
|
|
|
finish:
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
ret = send_utimes(sctx, pm->ino, pm->gen);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* After rename/move, need to update the utimes of both new parent(s)
|
|
|
|
* and old parent(s).
|
|
|
|
*/
|
|
|
|
list_for_each_entry(cur, &pm->update_refs, list) {
|
Btrfs: send, fix invalid leaf accesses due to incorrect utimes operations
During an incremental send, if we have delayed rename operations for inodes
that were children of directories which were removed in the send snapshot,
we can end up accessing incorrect items in a leaf or accessing beyond the
last item of the leaf due to issuing utimes operations for the removed
inodes. Consider the following example:
Parent snapshot:
. (ino 256)
|--- a/ (ino 257)
| |--- c/ (ino 262)
|
|--- b/ (ino 258)
| |--- d/ (ino 263)
|
|--- del/ (ino 261)
|--- x/ (ino 259)
|--- y/ (ino 260)
Send snapshot:
. (ino 256)
|--- a/ (ino 257)
|
|--- b/ (ino 258)
|
|--- c/ (ino 262)
| |--- y/ (ino 260)
|
|--- d/ (ino 263)
|--- x/ (ino 259)
1) When processing inodes 259 and 260, we end up delaying their rename
operations because their parents, inodes 263 and 262 respectively, were
not yet processed and therefore not yet renamed;
2) When processing inode 262, its rename operation is issued and right
after the rename operation for inode 260 is issued. However right after
issuing the rename operation for inode 260, at send.c:apply_dir_move(),
we issue utimes operations for all current and past parents of inode
260. This means we try to send a utimes operation for its old parent,
inode 261 (deleted in the send snapshot), which does not cause any
immediate and deterministic failure, because when the target inode is
not found in the send snapshot, the send.c:send_utimes() function
ignores it and uses the leaf region pointed to by path->slots[0],
which can be any unrelated item (belonging to other inode) or it can
be a region outside the leaf boundaries, if the leaf is full and
path->slots[0] matches the number of items in the leaf. So we end
up either successfully sending a utimes operation, which is fine
and irrelevant because the old parent (inode 261) will end up being
deleted later, or we end up doing an invalid memory access that
crashes the kernel.
So fix this by making apply_dir_move() issue utimes operations only for
parents that still exist in the send snapshot. In a separate patch we
will make send_utimes() return an error (-ENOENT) if the given inode
does not exist in the send snapshot.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and better organized]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:50 +08:00
|
|
|
/*
|
|
|
|
* The parent inode might have been deleted in the send snapshot
|
|
|
|
*/
|
|
|
|
ret = get_inode_info(sctx->send_root, cur->dir, NULL,
|
|
|
|
NULL, NULL, NULL, NULL, NULL);
|
|
|
|
if (ret == -ENOENT) {
|
|
|
|
ret = 0;
|
2014-02-19 22:31:44 +08:00
|
|
|
continue;
|
Btrfs: send, fix invalid leaf accesses due to incorrect utimes operations
During an incremental send, if we have delayed rename operations for inodes
that were children of directories which were removed in the send snapshot,
we can end up accessing incorrect items in a leaf or accessing beyond the
last item of the leaf due to issuing utimes operations for the removed
inodes. Consider the following example:
Parent snapshot:
. (ino 256)
|--- a/ (ino 257)
| |--- c/ (ino 262)
|
|--- b/ (ino 258)
| |--- d/ (ino 263)
|
|--- del/ (ino 261)
|--- x/ (ino 259)
|--- y/ (ino 260)
Send snapshot:
. (ino 256)
|--- a/ (ino 257)
|
|--- b/ (ino 258)
|
|--- c/ (ino 262)
| |--- y/ (ino 260)
|
|--- d/ (ino 263)
|--- x/ (ino 259)
1) When processing inodes 259 and 260, we end up delaying their rename
operations because their parents, inodes 263 and 262 respectively, were
not yet processed and therefore not yet renamed;
2) When processing inode 262, its rename operation is issued and right
after the rename operation for inode 260 is issued. However right after
issuing the rename operation for inode 260, at send.c:apply_dir_move(),
we issue utimes operations for all current and past parents of inode
260. This means we try to send a utimes operation for its old parent,
inode 261 (deleted in the send snapshot), which does not cause any
immediate and deterministic failure, because when the target inode is
not found in the send snapshot, the send.c:send_utimes() function
ignores it and uses the leaf region pointed to by path->slots[0],
which can be any unrelated item (belonging to other inode) or it can
be a region outside the leaf boundaries, if the leaf is full and
path->slots[0] matches the number of items in the leaf. So we end
up either successfully sending a utimes operation, which is fine
and irrelevant because the old parent (inode 261) will end up being
deleted later, or we end up doing an invalid memory access that
crashes the kernel.
So fix this by making apply_dir_move() issue utimes operations only for
parents that still exist in the send snapshot. In a separate patch we
will make send_utimes() return an error (-ENOENT) if the given inode
does not exist in the send snapshot.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and better organized]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:50 +08:00
|
|
|
}
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
ret = send_utimes(sctx, cur->dir, cur->dir_gen);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2014-02-16 21:43:11 +08:00
|
|
|
fs_path_free(name);
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
fs_path_free(from_path);
|
|
|
|
fs_path_free(to_path);
|
|
|
|
sctx->send_progress = orig_progress;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
|
|
|
|
{
|
|
|
|
if (!list_empty(&m->list))
|
|
|
|
list_del(&m->list);
|
|
|
|
if (!RB_EMPTY_NODE(&m->node))
|
|
|
|
rb_erase(&m->node, &sctx->pending_dir_moves);
|
|
|
|
__free_recorded_refs(&m->update_refs);
|
|
|
|
kfree(m);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void tail_append_pending_moves(struct pending_dir_move *moves,
|
|
|
|
struct list_head *stack)
|
|
|
|
{
|
|
|
|
if (list_empty(&moves->list)) {
|
|
|
|
list_add_tail(&moves->list, stack);
|
|
|
|
} else {
|
|
|
|
LIST_HEAD(list);
|
|
|
|
list_splice_init(&moves->list, &list);
|
|
|
|
list_add_tail(&moves->list, stack);
|
|
|
|
list_splice_tail(&list, stack);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int apply_children_dir_moves(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
struct pending_dir_move *pm;
|
|
|
|
struct list_head stack;
|
|
|
|
u64 parent_ino = sctx->cur_ino;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
pm = get_pending_dir_moves(sctx, parent_ino);
|
|
|
|
if (!pm)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&stack);
|
|
|
|
tail_append_pending_moves(pm, &stack);
|
|
|
|
|
|
|
|
while (!list_empty(&stack)) {
|
|
|
|
pm = list_first_entry(&stack, struct pending_dir_move, list);
|
|
|
|
parent_ino = pm->ino;
|
|
|
|
ret = apply_dir_move(sctx, pm);
|
|
|
|
free_pending_move(sctx, pm);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
pm = get_pending_dir_moves(sctx, parent_ino);
|
|
|
|
if (pm)
|
|
|
|
tail_append_pending_moves(pm, &stack);
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
while (!list_empty(&stack)) {
|
|
|
|
pm = list_first_entry(&stack, struct pending_dir_move, list);
|
|
|
|
free_pending_move(sctx, pm);
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-03-01 06:29:22 +08:00
|
|
|
/*
|
|
|
|
* We might need to delay a directory rename even when no ancestor directory
|
|
|
|
* (in the send root) with a higher inode number than ours (sctx->cur_ino) was
|
|
|
|
* renamed. This happens when we rename a directory to the old name (the name
|
|
|
|
* in the parent root) of some other unrelated directory that got its rename
|
|
|
|
* delayed due to some ancestor with higher number that got renamed.
|
|
|
|
*
|
|
|
|
* Example:
|
|
|
|
*
|
|
|
|
* Parent snapshot:
|
|
|
|
* . (ino 256)
|
|
|
|
* |---- a/ (ino 257)
|
|
|
|
* | |---- file (ino 260)
|
|
|
|
* |
|
|
|
|
* |---- b/ (ino 258)
|
|
|
|
* |---- c/ (ino 259)
|
|
|
|
*
|
|
|
|
* Send snapshot:
|
|
|
|
* . (ino 256)
|
|
|
|
* |---- a/ (ino 258)
|
|
|
|
* |---- x/ (ino 259)
|
|
|
|
* |---- y/ (ino 257)
|
|
|
|
* |----- file (ino 260)
|
|
|
|
*
|
|
|
|
* Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
|
|
|
|
* from 'a' to 'x/y' happening first, which in turn depends on the rename of
|
|
|
|
* inode 259 from 'c' to 'x'. So the order of rename commands the send stream
|
|
|
|
* must issue is:
|
|
|
|
*
|
|
|
|
* 1 - rename 259 from 'c' to 'x'
|
|
|
|
* 2 - rename 257 from 'a' to 'x/y'
|
|
|
|
* 3 - rename 258 from 'b' to 'a'
|
|
|
|
*
|
|
|
|
* Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
|
|
|
|
* be done right away and < 0 on error.
|
|
|
|
*/
|
|
|
|
static int wait_for_dest_dir_move(struct send_ctx *sctx,
|
|
|
|
struct recorded_ref *parent_ref,
|
|
|
|
const bool is_orphan)
|
|
|
|
{
|
2016-06-23 06:54:24 +08:00
|
|
|
struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
|
2015-03-01 06:29:22 +08:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key di_key;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
u64 left_gen;
|
|
|
|
u64 right_gen;
|
|
|
|
int ret = 0;
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name has inode 259. So
we end up with inode 258 move/rename operation waiting for inode's 259
move/rename operation, which in turn it waiting for inode's 258
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
struct waiting_dir_move *wdm;
|
2015-03-01 06:29:22 +08:00
|
|
|
|
|
|
|
if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = parent_ref->dir;
|
|
|
|
key.type = BTRFS_DIR_ITEM_KEY;
|
|
|
|
key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
|
|
|
|
if (ret < 0) {
|
|
|
|
goto out;
|
|
|
|
} else if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
|
|
|
|
parent_ref->name_len);
|
2015-03-01 06:29:22 +08:00
|
|
|
if (!di) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* di_key.objectid has the number of the inode that has a dentry in the
|
|
|
|
* parent directory with the same name that sctx->cur_ino is being
|
|
|
|
* renamed to. We need to check if that inode is in the send root as
|
|
|
|
* well and if it is currently marked as an inode with a pending rename,
|
|
|
|
* if it is, we need to delay the rename of sctx->cur_ino as well, so
|
|
|
|
* that it happens after that other inode is renamed.
|
|
|
|
*/
|
|
|
|
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
|
|
|
|
if (di_key.type != BTRFS_INODE_ITEM_KEY) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
|
|
|
|
&left_gen, NULL, NULL, NULL, NULL);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
|
|
|
|
&right_gen, NULL, NULL, NULL, NULL);
|
|
|
|
if (ret < 0) {
|
|
|
|
if (ret == -ENOENT)
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Different inode, no need to delay the rename of sctx->cur_ino */
|
|
|
|
if (right_gen != left_gen) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name has inode 259. So
we end up with inode 258 move/rename operation waiting for inode's 259
move/rename operation, which in turn it waiting for inode's 258
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
wdm = get_waiting_dir_move(sctx, di_key.objectid);
|
|
|
|
if (wdm && !wdm->orphanized) {
|
2015-03-01 06:29:22 +08:00
|
|
|
ret = add_pending_dir_move(sctx,
|
|
|
|
sctx->cur_ino,
|
|
|
|
sctx->cur_inode_gen,
|
|
|
|
di_key.objectid,
|
|
|
|
&sctx->new_refs,
|
|
|
|
&sctx->deleted_refs,
|
|
|
|
is_orphan);
|
|
|
|
if (!ret)
|
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-03-28 01:50:45 +08:00
|
|
|
/*
|
|
|
|
* Check if ino ino1 is an ancestor of inode ino2 in the given root.
|
|
|
|
* Return 1 if true, 0 if false and < 0 on error.
|
|
|
|
*/
|
|
|
|
static int is_ancestor(struct btrfs_root *root,
|
|
|
|
const u64 ino1,
|
|
|
|
const u64 ino1_gen,
|
|
|
|
const u64 ino2,
|
|
|
|
struct fs_path *fs_path)
|
|
|
|
{
|
|
|
|
u64 ino = ino2;
|
|
|
|
|
|
|
|
while (ino > BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
int ret;
|
|
|
|
u64 parent;
|
|
|
|
u64 parent_gen;
|
|
|
|
|
|
|
|
fs_path_reset(fs_path);
|
|
|
|
ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
|
|
|
|
if (ret < 0) {
|
|
|
|
if (ret == -ENOENT && ino == ino2)
|
|
|
|
ret = 0;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
if (parent == ino1)
|
|
|
|
return parent_gen == ino1_gen ? 1 : 0;
|
|
|
|
ino = parent;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory and eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower-numbered inode before the inode with the
higher number was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
static int wait_for_parent_move(struct send_ctx *sctx,
|
Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.
For example, for the following reproducer where this is needed (provided
by Robbie Ko):
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt2
$ mkdir -p /mnt/data/n1/n2
$ mkdir /mnt/data/n4
$ mkdir -p /mnt/data/t6/t7
$ mkdir /mnt/data/t5
$ mkdir /mnt/data/t7
$ mkdir /mnt/data/n4/t2
$ mkdir /mnt/data/t4
$ mkdir /mnt/data/t3
$ mv /mnt/data/t7 /mnt/data/n4/t2
$ mv /mnt/data/t4 /mnt/data/n4/t2/t7
$ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
$ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
$ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
$ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
$ mv /mnt/data/n4/t2 /mnt/data/n4/n1
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
$ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
$ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 | btrfs receive /mnt2
$ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
ERROR: send ioctl failed with -12: Cannot allocate memory
Where the parent snapshot directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- t2/ (ino 265)
|-- t7/ (ino 264)
|-- t4/ (ino 266)
|-- t5/ (ino 263)
|-- t6/ (ino 261)
|-- n1/ (ino 258)
|-- n2/ (ino 259)
|-- t7/ (ino 262)
|-- t3/ (ino 267)
And the send snapshot's directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- n1/ (ino 258)
|-- t2/ (ino 265)
|-- n2/ (ino 259)
|-- t3/ (ino 267)
| |-- t7 (ino 264)
|
|-- t6/ (ino 261)
| |-- t4/ (ino 266)
| |-- t5/ (ino 263)
|
|-- t7/ (ino 262)
While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
be done only after inode 267 is processed and renamed, as the
ancestor-descendant relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:
start inode 264, send progress of 265 for example
parent of 264 -> 267
parent of 267 -> 262
parent of 262 -> 259
parent of 259 -> 261
parent of 261 -> 263
parent of 263 -> 266
parent of 266 -> 264
|--> back to first iteration while current path string length
is <= PATH_MAX, and fail with -ENOMEM otherwise
So fix this by making the check if we need to delay a directory rename
regardless of the current inode having been orphanized or not.
A test case for fstests follows soon.
Thanks to Robbie Ko for providing a reproducer for this problem.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-04-09 21:09:14 +08:00
|
|
|
struct recorded_ref *parent_ref,
|
|
|
|
const bool is_orphan)
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the inode with the lower number before the inode with
the higher number was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
{
|
2014-03-28 04:14:01 +08:00
|
|
|
int ret = 0;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the inode with the lower number before the inode with
the higher number was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
u64 ino = parent_ref->dir;
|
Btrfs: incremental send, do not delay rename when parent inode is new
When we are checking if we need to delay the rename operation for an
inode we are not checking if a parent inode that exists in the send and
parent snapshots is really the same inode or not, that is, we are not
comparing the generation number of the parent inode in the send and
parent snapshots. Not only does this result in unnecessarily delaying a
rename operation, but it can also later on make us generate an incorrect
name for a new inode in the send snapshot that has the same number
as another inode in the parent snapshot but a different generation.
Here follows an example where this happens.
Parent snapshot:
. (ino 256, gen 3)
|--- dir258/ (ino 258, gen 7)
| |--- dir257/ (ino 257, gen 7)
|
|--- dir259/ (ino 259, gen 7)
Send snapshot:
. (ino 256, gen 3)
|--- file258 (ino 258, gen 10)
|
|--- new_dir259/ (ino 259, gen 10)
|--- dir257/ (ino 257, gen 7)
The following steps happen when computing the incremental send stream:
1) When processing inode 257, its new parent is created using its orphan
name (o257-21-0), and the rename operation for inode 257 is delayed
because its new parent (inode 259) was not yet processed - this
decision to delay the rename operation does not make much sense
because the inode 259 in the send snapshot is a new inode, it's not
the same as inode 259 in the parent snapshot.
2) When processing inode 258 we end up delaying its rmdir operation,
because inode 257 was not yet renamed (moved away from the directory
inode 258 represents). We also create the new inode 258 using its
orphan name "o258-10-0", then rename it to its final name of "file258"
and then issue a truncate operation for it. However this truncate
operation contains an incorrect name, which corresponds to the orphan
name and not to the final name, which makes the receiver fail. This
happens because when we attempt to compute the inode's current name
we verify that there's another inode with the same number (258) that
has its rmdir operation pending and because of that we generate an
orphan name for the new inode 258 (we do this in the function
get_cur_path()).
Fix this by not delaying the rename operation of an inode if it has parents
with the same number but different generations in both snapshots.
The following steps reproduce this example scenario.
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/dir257
$ mkdir /mnt/dir258
$ mkdir /mnt/dir259
$ mv /mnt/dir257 /mnt/dir258/dir257
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/dir258/dir257 /mnt/dir257
$ rmdir /mnt/dir258
$ rmdir /mnt/dir259
# Remount the filesystem so that the next created inodes will have the
# numbers 258 and 259. This is because when a filesystem is mounted,
# btrfs sets the subvolume's inode counter to a value corresponding to
# the highest inode number in the subvolume plus 1. This inode counter
# is used to assign a unique number to each new inode and it's
# incremented by 1 after every inode creation.
# Note: we unmount and then mount instead of doing a mount with
# "-o remount" because otherwise the inode counter remains at value 260.
$ umount /mnt
$ mount /dev/sdb /mnt
$ touch /mnt/file258
$ mkdir /mnt/new_dir259
$ mv /mnt/dir257 /mnt/new_dir259/dir257
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 -f /tmp/1.snap
$ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/2.snap
$ umount /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt
$ btrfs receive /mnt -f /tmp/1.snap
$ btrfs receive /mnt -f /tmp/2.snap -vv
receiving snapshot mysnap2 uuid=e059b6d1-7f55-f140-8d7c-9a3039d23c97, ctransid=10 parent_uuid=77e98cb6-8762-814f-9e05-e8ba877fc0b0, parent_ctransid=7
utimes
mkdir o259-10-0
rename dir258 -> o258-7-0
utimes
mkfile o258-10-0
rename o258-10-0 -> file258
utimes
truncate o258-10-0 size=0
ERROR: truncate o258-10-0 failed: No such file or directory
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-01-11 10:15:39 +08:00
|
|
|
u64 ino_gen = parent_ref->dir_gen;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the inode with the lower number before the inode with
the higher number was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
u64 parent_ino_before, parent_ino_after;
|
|
|
|
struct fs_path *path_before = NULL;
|
|
|
|
struct fs_path *path_after = NULL;
|
|
|
|
int len1, len2;
|
|
|
|
|
|
|
|
path_after = fs_path_alloc();
|
2014-03-28 04:14:01 +08:00
|
|
|
path_before = fs_path_alloc();
|
|
|
|
if (!path_after || !path_before) {
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the inode with the lower number before the inode with
the higher number was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2014-03-19 22:20:54 +08:00
|
|
|
/*
|
2014-03-28 04:14:01 +08:00
|
|
|
* Our current directory inode may not yet be renamed/moved because some
|
|
|
|
* ancestor (immediate or not) has to be renamed/moved first. So find if
|
|
|
|
* such ancestor exists and make sure our own rename/move happens after
|
2015-03-28 01:50:45 +08:00
|
|
|
* that ancestor is processed to avoid path build infinite loops (done
|
|
|
|
* at get_cur_path()).
|
2014-03-19 22:20:54 +08:00
|
|
|
*/
|
2014-03-28 04:14:01 +08:00
|
|
|
while (ino > BTRFS_FIRST_FREE_OBJECTID) {
|
Btrfs: incremental send, do not delay rename when parent inode is new
When we are checking if we need to delay the rename operation for an
inode we are not checking if a parent inode that exists in the send and
parent snapshots is really the same inode or not, that is, we are not
comparing the generation number of the parent inode in the send and
parent snapshots. Not only does this result in unnecessarily delaying a
rename operation, but it can also later on make us generate an incorrect
name for a new inode in the send snapshot that has the same number
as another inode in the parent snapshot but a different generation.
Here follows an example where this happens.
Parent snapshot:
. (ino 256, gen 3)
|--- dir258/ (ino 258, gen 7)
| |--- dir257/ (ino 257, gen 7)
|
|--- dir259/ (ino 259, gen 7)
Send snapshot:
. (ino 256, gen 3)
|--- file258 (ino 258, gen 10)
|
|--- new_dir259/ (ino 259, gen 10)
|--- dir257/ (ino 257, gen 7)
The following steps happen when computing the incremental send stream:
1) When processing inode 257, its new parent is created using its orphan
name (o257-21-0), and the rename operation for inode 257 is delayed
because its new parent (inode 259) was not yet processed - this
decision to delay the rename operation does not make much sense
because the inode 259 in the send snapshot is a new inode, it's not
the same as inode 259 in the parent snapshot.
2) When processing inode 258 we end up delaying its rmdir operation,
because inode 257 was not yet renamed (moved away from the directory
inode 258 represents). We also create the new inode 258 using its
orphan name "o258-10-0", then rename it to its final name of "file258"
and then issue a truncate operation for it. However this truncate
operation contains an incorrect name, which corresponds to the orphan
name and not to the final name, which makes the receiver fail. This
happens because when we attempt to compute the inode's current name
we verify that there's another inode with the same number (258) that
has its rmdir operation pending and because of that we generate an
orphan name for the new inode 258 (we do this in the function
get_cur_path()).
Fix this by not delaying the rename operation of an inode if it has parents
with the same number but different generations in both snapshots.
The following steps reproduce this example scenario.
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/dir257
$ mkdir /mnt/dir258
$ mkdir /mnt/dir259
$ mv /mnt/dir257 /mnt/dir258/dir257
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/dir258/dir257 /mnt/dir257
$ rmdir /mnt/dir258
$ rmdir /mnt/dir259
# Remount the filesystem so that the next created inodes will have the
# numbers 258 and 259. This is because when a filesystem is mounted,
# btrfs sets the subvolume's inode counter to a value corresponding to
# the highest inode number in the subvolume plus 1. This inode counter
# is used to assign a unique number to each new inode and it's
# incremented by 1 after every inode creation.
# Note: we unmount and then mount instead of doing a mount with
# "-o remount" because otherwise the inode counter remains at value 260.
$ umount /mnt
$ mount /dev/sdb /mnt
$ touch /mnt/file258
$ mkdir /mnt/new_dir259
$ mv /mnt/dir257 /mnt/new_dir259/dir257
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 -f /tmp/1.snap
$ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/2.snap
$ umount /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt
$ btrfs receive /mnt -f /tmp/1.snap
$ btrfs receive /mnt -f /tmp/2.snap -vv
receiving snapshot mysnap2 uuid=e059b6d1-7f55-f140-8d7c-9a3039d23c97, ctransid=10 parent_uuid=77e98cb6-8762-814f-9e05-e8ba877fc0b0, parent_ctransid=7
utimes
mkdir o259-10-0
rename dir258 -> o258-7-0
utimes
mkfile o258-10-0
rename o258-10-0 -> file258
utimes
truncate o258-10-0 size=0
ERROR: truncate o258-10-0 failed: No such file or directory
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-01-11 10:15:39 +08:00
|
|
|
u64 parent_ino_after_gen;
|
|
|
|
|
2014-03-28 04:14:01 +08:00
|
|
|
if (is_waiting_for_move(sctx, ino)) {
|
2015-03-28 01:50:45 +08:00
|
|
|
/*
|
|
|
|
* If the current inode is an ancestor of ino in the
|
|
|
|
* parent root, we need to delay the rename of the
|
|
|
|
* current inode, otherwise don't delayed the rename
|
|
|
|
* because we can end up with a circular dependency
|
|
|
|
* of renames, resulting in some directories never
|
|
|
|
* getting the respective rename operations issued in
|
|
|
|
* the send stream or getting into infinite path build
|
|
|
|
* loops.
|
|
|
|
*/
|
|
|
|
ret = is_ancestor(sctx->parent_root,
|
|
|
|
sctx->cur_ino, sctx->cur_inode_gen,
|
|
|
|
ino, path_before);
|
Btrfs: incremental send, fix invalid paths for rename operations
Example scenario:
Parent snapshot:
. (ino 277)
|---- tmp/ (ino 278)
|---- pre/ (ino 280)
| |---- wait_dir/ (ino 281)
|
|---- desc/ (ino 282)
|---- ance/ (ino 283)
| |---- below_ance/ (ino 279)
|
|---- other_dir/ (ino 284)
Send snapshot:
. (ino 277)
|---- tmp/ (ino 278)
|---- other_dir/ (ino 284)
|---- below_ance/ (ino 279)
| |---- pre/ (ino 280)
|
|---- wait_dir/ (ino 281)
|---- desc/ (ino 282)
|---- ance/ (ino 283)
While computing the send stream the following steps happen:
1) While processing inode 279 we end up delaying its rename operation
because its new parent in the send snapshot, inode 284, was not
yet processed and therefore not yet renamed;
2) Later when processing inode 280 we end up renaming it immediately to
"ance/below_ance/pre" and not delay its rename operation because its
new parent (inode 279 in the send snapshot) has its rename operation
delayed and inode 280 is not an ancestor of inode 279 (its parent in
the send snapshot) in the parent snapshot;
3) When processing inode 281 we end up delaying its rename operation
because its new parent in the send snapshot, inode 284, was not yet
processed and therefore not yet renamed;
4) When processing inode 282 we do not delay its rename operation because
its parent in the send snapshot, inode 281, already has its own rename
operation delayed and our current inode (282) is not an ancestor of
inode 281 in the parent snapshot. Therefore inode 282 is renamed to
"ance/below_ance/pre/wait_dir";
5) When processing inode 283 we realize that we can rename it because one
of its ancestors in the send snapshot, inode 281, has its rename
operation delayed and inode 283 is not an ancestor of inode 281 in the
parent snapshot. So a rename operation to rename inode 283 to
"ance/below_ance/pre/wait_dir/desc/ance" is issued. This path is
invalid due to a path building loop that went undetected by
the incremental send implementation, as inode 283 ends up getting
included twice in the path (once with its path in the parent snapshot).
Therefore its rename operation must wait until the ancestor inode 284
is renamed.
Fix this by not terminating the rename dependency checks when we find an
ancestor, in the send snapshot, that has its rename operation delayed. So
that we continue doing the same checks if the current inode is not an
ancestor, in the parent snapshot, of an ancestor in the send snapshot we
are processing in the loop.
The problem and reproducer were reported by Robbie Ko, as part of a patch
titled "Btrfs: incremental send, avoid ancestor rename to descendant".
However the fix was unnecessarily complicated and can be addressed with
much less code and effort.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2016-06-27 23:54:44 +08:00
|
|
|
if (ret)
|
|
|
|
break;
|
2014-03-28 04:14:01 +08:00
|
|
|
}
|
2014-03-19 22:20:54 +08:00
|
|
|
|
|
|
|
fs_path_reset(path_before);
|
|
|
|
fs_path_reset(path_after);
|
|
|
|
|
|
|
|
ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
|
Btrfs: incremental send, do not delay rename when parent inode is new
When we are checking if we need to delay the rename operation for an
inode we are not checking if a parent inode that exists in the send and
parent snapshots is really the same inode or not, that is, we are not
comparing the generation number of the parent inode in the send and
parent snapshots. Not only does this result in unnecessarily delaying a
rename operation, but it can also later on make us generate an incorrect
name for a new inode in the send snapshot that has the same number
as another inode in the parent snapshot but a different generation.
Here follows an example where this happens.
Parent snapshot:
. (ino 256, gen 3)
|--- dir258/ (ino 258, gen 7)
| |--- dir257/ (ino 257, gen 7)
|
|--- dir259/ (ino 259, gen 7)
Send snapshot:
. (ino 256, gen 3)
|--- file258 (ino 258, gen 10)
|
|--- new_dir259/ (ino 259, gen 10)
|--- dir257/ (ino 257, gen 7)
The following steps happen when computing the incremental send stream:
1) When processing inode 257, its new parent is created using its orphan
name (o257-21-0), and the rename operation for inode 257 is delayed
because its new parent (inode 259) was not yet processed - this
decision to delay the rename operation does not make much sense
because the inode 259 in the send snapshot is a new inode, it's not
the same as inode 259 in the parent snapshot.
2) When processing inode 258 we end up delaying its rmdir operation,
because inode 257 was not yet renamed (moved away from the directory
inode 258 represents). We also create the new inode 258 using its
orphan name "o258-10-0", then rename it to its final name of "file258"
and then issue a truncate operation for it. However this truncate
operation contains an incorrect name, which corresponds to the orphan
name and not to the final name, which makes the receiver fail. This
happens because when we attempt to compute the inode's current name
we verify that there's another inode with the same number (258) that
has its rmdir operation pending and because of that we generate an
orphan name for the new inode 258 (we do this in the function
get_cur_path()).
Fix this by not delaying the rename operation of an inode if it has parents
with the same number but different generations in both snapshots.
The following steps reproduce this example scenario.
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/dir257
$ mkdir /mnt/dir258
$ mkdir /mnt/dir259
$ mv /mnt/dir257 /mnt/dir258/dir257
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/dir258/dir257 /mnt/dir257
$ rmdir /mnt/dir258
$ rmdir /mnt/dir259
# Remount the filesystem so that the next created inodes will have the
# numbers 258 and 259. This is because when a filesystem is mounted,
# btrfs sets the subvolume's inode counter to a value corresponding to
# the highest inode number in the subvolume plus 1. This inode counter
# is used to assign a unique number to each new inode and it's
# incremented by 1 after every inode creation.
# Note: we unmount and then mount instead of doing a mount with
# "-o remount" because otherwise the inode counter remains at value 260.
$ umount /mnt
$ mount /dev/sdb /mnt
$ touch /mnt/file258
$ mkdir /mnt/new_dir259
$ mv /mnt/dir257 /mnt/new_dir259/dir257
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 -f /tmp/1.snap
$ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/2.snap
$ umount /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt
$ btrfs receive /mnt -f /tmp/1.snap
$ btrfs receive /mnt -f /tmp/2.snap -vv
receiving snapshot mysnap2 uuid=e059b6d1-7f55-f140-8d7c-9a3039d23c97, ctransid=10 parent_uuid=77e98cb6-8762-814f-9e05-e8ba877fc0b0, parent_ctransid=7
utimes
mkdir o259-10-0
rename dir258 -> o258-7-0
utimes
mkfile o258-10-0
rename o258-10-0 -> file258
utimes
truncate o258-10-0 size=0
ERROR: truncate o258-10-0 failed: No such file or directory
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-01-11 10:15:39 +08:00
|
|
|
&parent_ino_after_gen, path_after);
|
2014-03-19 22:20:54 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
|
|
|
|
NULL, path_before);
|
2014-03-28 04:14:01 +08:00
|
|
|
if (ret < 0 && ret != -ENOENT) {
|
2014-03-19 22:20:54 +08:00
|
|
|
goto out;
|
2014-03-28 04:14:01 +08:00
|
|
|
} else if (ret == -ENOENT) {
|
2014-10-03 02:17:32 +08:00
|
|
|
ret = 0;
|
2014-03-28 04:14:01 +08:00
|
|
|
break;
|
2014-03-19 22:20:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
len1 = fs_path_len(path_before);
|
|
|
|
len2 = fs_path_len(path_after);
|
2014-03-28 04:14:01 +08:00
|
|
|
if (ino > sctx->cur_ino &&
|
|
|
|
(parent_ino_before != parent_ino_after || len1 != len2 ||
|
|
|
|
memcmp(path_before->start, path_after->start, len1))) {
|
Btrfs: incremental send, do not delay rename when parent inode is new
When we are checking if we need to delay the rename operation for an
inode we are not checking if a parent inode that exists in the send and
parent snapshots is really the same inode or not, that is, we are not
comparing the generation number of the parent inode in the send and
parent snapshots. Not only this results in unnecessarily delaying a
rename operation but also can later on make us generate an incorrect
name for a new inode in the send snapshot that has the same number
as another inode in the parent snapshot but a different generation.
Here follows an example where this happens.
Parent snapshot:
. (ino 256, gen 3)
|--- dir258/ (ino 258, gen 7)
| |--- dir257/ (ino 257, gen 7)
|
|--- dir259/ (ino 259, gen 7)
Send snapshot:
. (ino 256, gen 3)
|--- file258 (ino 258, gen 10)
|
|--- new_dir259/ (ino 259, gen 10)
|--- dir257/ (ino 257, gen 7)
The following steps happen when computing the incremental send stream:
1) When processing inode 257, its new parent is created using its orphan
name (o257-21-0), and the rename operation for inode 257 is delayed
because its new parent (inode 259) was not yet processed - this
decision to delay the rename operation does not make much sense
because the inode 259 in the send snapshot is a new inode, it's not
the same as inode 259 in the parent snapshot.
2) When processing inode 258 we end up delaying its rmdir operation,
because inode 257 was not yet renamed (moved away from the directory
inode 258 represents). We also create the new inode 258 using its
orphan name "o258-10-0", then rename it to its final name of "file258"
and then issue a truncate operation for it. However this truncate
operation contains an incorrect name, which corresponds to the orphan
name and not to the final name, which makes the receiver fail. This
happens because when we attempt to compute the inode's current name
we verify that there's another inode with the same number (258) that
has its rmdir operation pending and because of that we generate an
orphan name for the new inode 258 (we do this in the function
get_cur_path()).
Fix this by not delaying the rename operation of an inode if it has parents
with the same number but different generations in both snapshots.
The following steps reproduce this example scenario.
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/dir257
$ mkdir /mnt/dir258
$ mkdir /mnt/dir259
$ mv /mnt/dir257 /mnt/dir258/dir257
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/dir258/dir257 /mnt/dir257
$ rmdir /mnt/dir258
$ rmdir /mnt/dir259
# Remount the filesystem so that the next created inodes will have the
# numbers 258 and 259. This is because when a filesystem is mounted,
# btrfs sets the subvolume's inode counter to a value corresponding to
# the highest inode number in the subvolume plus 1. This inode counter
# is used to assign a unique number to each new inode and it's
# incremented by 1 after every inode creation.
# Note: we unmount and then mount instead of doing a mount with
# "-o remount" because otherwise the inode counter remains at value 260.
$ umount /mnt
$ mount /dev/sdb /mnt
$ touch /mnt/file258
$ mkdir /mnt/new_dir259
$ mv /mnt/dir257 /mnt/new_dir259/dir257
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 -f /tmp/1.snap
$ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/2.snap
$ umount /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt
$ btrfs receive /mnt -f /tmp/1.snap
$ btrfs receive /mnt -f /tmp/2.snap -vv
receiving snapshot mysnap2 uuid=e059b6d1-7f55-f140-8d7c-9a3039d23c97, ctransid=10 parent_uuid=77e98cb6-8762-814f-9e05-e8ba877fc0b0, parent_ctransid=7
utimes
mkdir o259-10-0
rename dir258 -> o258-7-0
utimes
mkfile o258-10-0
rename o258-10-0 -> file258
utimes
truncate o258-10-0 size=0
ERROR: truncate o258-10-0 failed: No such file or directory
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-01-11 10:15:39 +08:00
|
|
|
u64 parent_ino_gen;
|
|
|
|
|
|
|
|
ret = get_inode_info(sctx->parent_root, ino, NULL,
|
|
|
|
&parent_ino_gen, NULL, NULL, NULL,
|
|
|
|
NULL);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ino_gen == parent_ino_gen) {
|
|
|
|
ret = 1;
|
|
|
|
break;
|
|
|
|
}
|
2014-03-19 22:20:54 +08:00
|
|
|
}
|
|
|
|
ino = parent_ino_after;
|
Btrfs: incremental send, do not delay rename when parent inode is new
When we are checking if we need to delay the rename operation for an
inode we are not checking if a parent inode that exists in the send and
parent snapshots is really the same inode or not, that is, we are not
comparing the generation number of the parent inode in the send and
parent snapshots. Not only this results in unnecessarily delaying a
rename operation but also can later on make us generate an incorrect
name for a new inode in the send snapshot that has the same number
as another inode in the parent snapshot but a different generation.
Here follows an example where this happens.
Parent snapshot:
. (ino 256, gen 3)
|--- dir258/ (ino 258, gen 7)
| |--- dir257/ (ino 257, gen 7)
|
|--- dir259/ (ino 259, gen 7)
Send snapshot:
. (ino 256, gen 3)
|--- file258 (ino 258, gen 10)
|
|--- new_dir259/ (ino 259, gen 10)
|--- dir257/ (ino 257, gen 7)
The following steps happen when computing the incremental send stream:
1) When processing inode 257, its new parent is created using its orphan
name (o257-21-0), and the rename operation for inode 257 is delayed
because its new parent (inode 259) was not yet processed - this
decision to delay the rename operation does not make much sense
because the inode 259 in the send snapshot is a new inode, it's not
the same as inode 259 in the parent snapshot.
2) When processing inode 258 we end up delaying its rmdir operation,
because inode 257 was not yet renamed (moved away from the directory
inode 258 represents). We also create the new inode 258 using its
orphan name "o258-10-0", then rename it to its final name of "file258"
and then issue a truncate operation for it. However this truncate
operation contains an incorrect name, which corresponds to the orphan
name and not to the final name, which makes the receiver fail. This
happens because when we attempt to compute the inode's current name
we verify that there's another inode with the same number (258) that
has its rmdir operation pending and because of that we generate an
orphan name for the new inode 258 (we do this in the function
get_cur_path()).
Fix this by not delaying the rename operation of an inode if it has parents
with the same number but different generations in both snapshots.
The following steps reproduce this example scenario.
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/dir257
$ mkdir /mnt/dir258
$ mkdir /mnt/dir259
$ mv /mnt/dir257 /mnt/dir258/dir257
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/dir258/dir257 /mnt/dir257
$ rmdir /mnt/dir258
$ rmdir /mnt/dir259
# Remount the filesystem so that the next created inodes will have the
# numbers 258 and 259. This is because when a filesystem is mounted,
# btrfs sets the subvolume's inode counter to a value corresponding to
# the highest inode number in the subvolume plus 1. This inode counter
# is used to assign a unique number to each new inode and it's
# incremented by 1 after every inode creation.
# Note: we unmount and then mount instead of doing a mount with
# "-o remount" because otherwise the inode counter remains at value 260.
$ umount /mnt
$ mount /dev/sdb /mnt
$ touch /mnt/file258
$ mkdir /mnt/new_dir259
$ mv /mnt/dir257 /mnt/new_dir259/dir257
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 -f /tmp/1.snap
$ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/2.snap
$ umount /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt
$ btrfs receive /mnt -f /tmp/1.snap
$ btrfs receive /mnt -f /tmp/2.snap -vv
receiving snapshot mysnap2 uuid=e059b6d1-7f55-f140-8d7c-9a3039d23c97, ctransid=10 parent_uuid=77e98cb6-8762-814f-9e05-e8ba877fc0b0, parent_ctransid=7
utimes
mkdir o259-10-0
rename dir258 -> o258-7-0
utimes
mkfile o258-10-0
rename o258-10-0 -> file258
utimes
truncate o258-10-0 size=0
ERROR: truncate o258-10-0 failed: No such file or directory
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-01-11 10:15:39 +08:00
|
|
|
ino_gen = parent_ino_after_gen;
|
2014-03-19 22:20:54 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
out:
|
|
|
|
fs_path_free(path_before);
|
|
|
|
fs_path_free(path_after);
|
|
|
|
|
2014-03-28 04:14:01 +08:00
|
|
|
if (ret == 1) {
|
|
|
|
ret = add_pending_dir_move(sctx,
|
|
|
|
sctx->cur_ino,
|
|
|
|
sctx->cur_inode_gen,
|
|
|
|
ino,
|
|
|
|
&sctx->new_refs,
|
2015-03-01 06:29:22 +08:00
|
|
|
&sctx->deleted_refs,
|
Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.
For example, for the following reproducer where this is needed (provided
by Robbie Ko):
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt2
$ mkdir -p /mnt/data/n1/n2
$ mkdir /mnt/data/n4
$ mkdir -p /mnt/data/t6/t7
$ mkdir /mnt/data/t5
$ mkdir /mnt/data/t7
$ mkdir /mnt/data/n4/t2
$ mkdir /mnt/data/t4
$ mkdir /mnt/data/t3
$ mv /mnt/data/t7 /mnt/data/n4/t2
$ mv /mnt/data/t4 /mnt/data/n4/t2/t7
$ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
$ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
$ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
$ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
$ mv /mnt/data/n4/t2 /mnt/data/n4/n1
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
$ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
$ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 | btrfs receive /mnt2
$ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
ERROR: send ioctl failed with -12: Cannot allocate memory
Where the parent snapshot directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- t2/ (ino 265)
|-- t7/ (ino 264)
|-- t4/ (ino 266)
|-- t5/ (ino 263)
|-- t6/ (ino 261)
|-- n1/ (ino 258)
|-- n2/ (ino 259)
|-- t7/ (ino 262)
|-- t3/ (ino 267)
And the send snapshot's directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- n1/ (ino 258)
|-- t2/ (ino 265)
|-- n2/ (ino 259)
|-- t3/ (ino 267)
| |-- t7 (ino 264)
|
|-- t6/ (ino 261)
| |-- t4/ (ino 266)
| |-- t5/ (ino 263)
|
|-- t7/ (ino 262)
While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
have been done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:
start inode 264, send progress of 265 for example
parent of 264 -> 267
parent of 267 -> 262
parent of 262 -> 259
parent of 259 -> 261
parent of 261 -> 263
parent of 263 -> 266
parent of 266 -> 264
|--> back to first iteration while current path string length
is <= PATH_MAX, and fail with -ENOMEM otherwise
So fix this by making the check if we need to delay a directory rename
regardless of the current inode having been orphanized or not.
A test case for fstests follows soon.
Thanks to Robbie Ko for providing a reproducer for this problem.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-04-09 21:09:14 +08:00
|
|
|
is_orphan);
|
2014-03-28 04:14:01 +08:00
|
|
|
if (!ret)
|
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
/*
|
|
|
|
* This does all the move/link/unlink/rmdir magic.
|
|
|
|
*/
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
|
2012-07-26 05:19:24 +08:00
|
|
|
{
|
2016-09-20 22:05:03 +08:00
|
|
|
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
|
2012-07-26 05:19:24 +08:00
|
|
|
int ret = 0;
|
|
|
|
struct recorded_ref *cur;
|
2012-07-28 16:42:24 +08:00
|
|
|
struct recorded_ref *cur2;
|
2013-08-17 04:52:55 +08:00
|
|
|
struct list_head check_dirs;
|
2012-07-26 05:19:24 +08:00
|
|
|
struct fs_path *valid_path = NULL;
|
2012-07-26 07:21:10 +08:00
|
|
|
u64 ow_inode = 0;
|
2012-07-26 05:19:24 +08:00
|
|
|
u64 ow_gen;
|
|
|
|
int did_overwrite = 0;
|
|
|
|
int is_orphan = 0;
|
2014-02-17 05:01:39 +08:00
|
|
|
u64 last_dir_ino_rm = 0;
|
2015-03-01 06:29:22 +08:00
|
|
|
bool can_rename = true;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2016-09-20 22:05:03 +08:00
|
|
|
btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2012-08-01 20:48:59 +08:00
|
|
|
/*
|
|
|
|
* This should never happen as the root dir always has the same ref
|
|
|
|
* which is always '..'
|
|
|
|
*/
|
|
|
|
BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
|
2013-08-17 04:52:55 +08:00
|
|
|
INIT_LIST_HEAD(&check_dirs);
|
2012-08-01 20:48:59 +08:00
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
valid_path = fs_path_alloc();
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!valid_path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* First, check if the first ref of the current inode was overwritten
|
|
|
|
* before. If yes, we know that the current inode was already orphanized
|
|
|
|
* and thus use the orphan name. If not, we can use get_cur_path to
|
|
|
|
* get the path of the first ref as it would look like while receiving at
|
|
|
|
* this point in time.
|
|
|
|
* New inodes are always orphan at the beginning, so force to use the
|
|
|
|
* orphan name in this case.
|
|
|
|
* The first ref is stored in valid_path and will be updated if it
|
|
|
|
* gets moved around.
|
|
|
|
*/
|
|
|
|
if (!sctx->cur_inode_new) {
|
|
|
|
ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
|
|
|
|
sctx->cur_inode_gen);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret)
|
|
|
|
did_overwrite = 1;
|
|
|
|
}
|
|
|
|
if (sctx->cur_inode_new || did_overwrite) {
|
|
|
|
ret = gen_unique_name(sctx, sctx->cur_ino,
|
|
|
|
sctx->cur_inode_gen, valid_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
is_orphan = 1;
|
|
|
|
} else {
|
|
|
|
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
|
|
|
|
valid_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
list_for_each_entry(cur, &sctx->new_refs, list) {
|
2012-07-28 16:42:24 +08:00
|
|
|
/*
|
|
|
|
* We may have refs where the parent directory does not exist
|
|
|
|
* yet. This happens if the parent directories inum is higher
|
|
|
|
* than the current inum. To handle this case, we create the
|
|
|
|
* parent directory out of order. But we need to check if this
|
|
|
|
* did already happen before due to other refs in the same dir.
|
|
|
|
*/
|
|
|
|
ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret == inode_state_will_create) {
|
|
|
|
ret = 0;
|
|
|
|
/*
|
|
|
|
* First check if any of the current inodes refs did
|
|
|
|
* already create the dir.
|
|
|
|
*/
|
|
|
|
list_for_each_entry(cur2, &sctx->new_refs, list) {
|
|
|
|
if (cur == cur2)
|
|
|
|
break;
|
|
|
|
if (cur2->dir == cur->dir) {
|
|
|
|
ret = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If that did not happen, check if a previous inode
|
|
|
|
* did already create the dir.
|
|
|
|
*/
|
|
|
|
if (!ret)
|
|
|
|
ret = did_create_dir(sctx, cur->dir);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (!ret) {
|
|
|
|
ret = send_create_inode(sctx, cur->dir);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
/*
|
|
|
|
* Check if this new ref would overwrite the first ref of
|
|
|
|
* another unprocessed inode. If yes, orphanize the
|
|
|
|
* overwritten inode. If we find an overwritten ref that is
|
|
|
|
* not the first ref, simply unlink it.
|
|
|
|
*/
|
|
|
|
ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
|
|
|
|
cur->name, cur->name_len,
|
|
|
|
&ow_inode, &ow_gen);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = is_first_ref(sctx->parent_root,
|
|
|
|
ow_inode, cur->dir, cur->name,
|
|
|
|
cur->name_len);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
2015-03-12 23:16:20 +08:00
|
|
|
struct name_cache_entry *nce;
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name as inode 259. So
we end up with inode 258's move/rename operation waiting for inode 259's
move/rename operation, which in turn is waiting for inode 258's
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
struct waiting_dir_move *wdm;
|
2015-03-12 23:16:20 +08:00
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = orphanize_inode(sctx, ow_inode, ow_gen,
|
|
|
|
cur->full_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name as inode 259. So
we end up with inode 258's move/rename operation waiting for inode 259's
move/rename operation, which in turn is waiting for inode 258's
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If ow_inode has its rename operation delayed
|
|
|
|
* make sure that its orphanized name is used in
|
|
|
|
* the source path when performing its rename
|
|
|
|
* operation.
|
|
|
|
*/
|
|
|
|
if (is_waiting_for_move(sctx, ow_inode)) {
|
|
|
|
wdm = get_waiting_dir_move(sctx,
|
|
|
|
ow_inode);
|
|
|
|
ASSERT(wdm);
|
|
|
|
wdm->orphanized = true;
|
|
|
|
}
|
|
|
|
|
2015-03-12 23:16:20 +08:00
|
|
|
/*
|
|
|
|
* Make sure we clear our orphanized inode's
|
|
|
|
* name from the name cache. This is because the
|
|
|
|
* inode ow_inode might be an ancestor of some
|
|
|
|
* other inode that will be orphanized as well
|
|
|
|
* later and has an inode number greater than
|
|
|
|
* sctx->send_progress. We need to prevent
|
|
|
|
* future name lookups from using the old name
|
|
|
|
* and get instead the orphan name.
|
|
|
|
*/
|
|
|
|
nce = name_cache_search(sctx, ow_inode, ow_gen);
|
|
|
|
if (nce) {
|
|
|
|
name_cache_delete(sctx, nce);
|
|
|
|
kfree(nce);
|
|
|
|
}
|
Btrfs: send, fix failure to move directories with the same name around
When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.
For example, consider the following scenario:
Parent snapshot:
. (ino 256)
|---- d/ (ino 257)
| |--- p1/ (ino 258)
|
|---- p1/ (ino 259)
Send snapshot:
. (ino 256)
|--- d/ (ino 257)
|--- p1/ (ino 259)
|--- p1/ (ino 258)
The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name as inode 259. So
we end up with inode 258's move/rename operation waiting for inode 259's
move/rename operation, which in turn is waiting for inode 258's
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:
[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136] [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G W 4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104] 0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104] 0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104] ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104] [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104] [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104] [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104] [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104] [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104] [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104] [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104] [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104] [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104] [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104] [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104] [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---
There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:
Parent snapshot:
.
|---- a/ (ino 262)
| |---- c/ (ino 268)
|
|---- d/ (ino 263)
|---- ance/ (ino 267)
|---- e/ (ino 264)
|---- f/ (ino 265)
|---- ance/ (ino 266)
Send snapshot:
.
|---- a/ (ino 262)
|---- c/ (ino 268)
| |---- ance/ (ino 267)
|
|---- d/ (ino 263)
| |---- ance/ (ino 266)
|
|---- f/ (ino 265)
|---- e/ (ino 264)
When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:
ERROR: rename d/ance/ance -> d/ance failed: No such file or directory
So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.
A test case for fstests will follow soon.
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
comments]
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-23 18:39:46 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ow_inode might currently be an ancestor of
|
|
|
|
* cur_ino, therefore compute valid_path (the
|
|
|
|
* current path of cur_ino) again because it
|
|
|
|
* might contain the pre-orphanization name of
|
|
|
|
* ow_inode, which is no longer valid.
|
|
|
|
*/
|
|
|
|
fs_path_reset(valid_path);
|
|
|
|
ret = get_cur_path(sctx, sctx->cur_ino,
|
|
|
|
sctx->cur_inode_gen, valid_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-07-26 05:19:24 +08:00
|
|
|
} else {
|
|
|
|
ret = send_unlink(sctx, cur->full_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-03-01 06:29:22 +08:00
|
|
|
if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
|
|
|
|
ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret == 1) {
|
|
|
|
can_rename = false;
|
|
|
|
*pending_move = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.
For example, for the following reproducer where this is needed (provided
by Robbie Ko):
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt2
$ mkdir -p /mnt/data/n1/n2
$ mkdir /mnt/data/n4
$ mkdir -p /mnt/data/t6/t7
$ mkdir /mnt/data/t5
$ mkdir /mnt/data/t7
$ mkdir /mnt/data/n4/t2
$ mkdir /mnt/data/t4
$ mkdir /mnt/data/t3
$ mv /mnt/data/t7 /mnt/data/n4/t2
$ mv /mnt/data/t4 /mnt/data/n4/t2/t7
$ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
$ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
$ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
$ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
$ mv /mnt/data/n4/t2 /mnt/data/n4/n1
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
$ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
$ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 | btrfs receive /mnt2
$ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
ERROR: send ioctl failed with -12: Cannot allocate memory
Where the parent snapshot directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- t2/ (ino 265)
|-- t7/ (ino 264)
|-- t4/ (ino 266)
|-- t5/ (ino 263)
|-- t6/ (ino 261)
|-- n1/ (ino 258)
|-- n2/ (ino 259)
|-- t7/ (ino 262)
|-- t3/ (ino 267)
And the send snapshot's directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- n1/ (ino 258)
|-- t2/ (ino 265)
|-- n2/ (ino 259)
|-- t3/ (ino 267)
| |-- t7 (ino 264)
|
|-- t6/ (ino 261)
| |-- t4/ (ino 266)
| |-- t5/ (ino 263)
|
|-- t7/ (ino 262)
While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
be done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:
start inode 264, send progress of 265 for example
parent of 264 -> 267
parent of 267 -> 262
parent of 262 -> 259
parent of 259 -> 261
parent of 261 -> 263
parent of 263 -> 266
parent of 266 -> 264
|--> back to first iteration while current path string length
is <= PATH_MAX, and fail with -ENOMEM otherwise
So fix this by checking whether a directory rename needs to be delayed
regardless of whether the current inode has been orphanized or not.
A test case for fstests follows soon.
Thanks to Robbie Ko for providing a reproducer for this problem.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-04-09 21:09:14 +08:00
|
|
|
if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
|
|
|
|
can_rename) {
|
|
|
|
ret = wait_for_parent_move(sctx, cur, is_orphan);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret == 1) {
|
|
|
|
can_rename = false;
|
|
|
|
*pending_move = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
/*
|
|
|
|
* link/move the ref to the new place. If we have an orphan
|
|
|
|
* inode, move it and update valid_path. If not, link or move
|
|
|
|
* it depending on the inode mode.
|
|
|
|
*/
|
2015-03-01 06:29:22 +08:00
|
|
|
if (is_orphan && can_rename) {
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = send_rename(sctx, valid_path, cur->full_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
is_orphan = 0;
|
|
|
|
ret = fs_path_copy(valid_path, cur->full_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2015-03-01 06:29:22 +08:00
|
|
|
} else if (can_rename) {
|
2012-07-26 05:19:24 +08:00
|
|
|
if (S_ISDIR(sctx->cur_inode_mode)) {
|
|
|
|
/*
|
|
|
|
* Dirs can't be linked, so move it. For moved
|
|
|
|
* dirs, we always have one new and one deleted
|
|
|
|
* ref. The deleted ref is ignored later.
|
|
|
|
*/
|
Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.
For example, for the following reproducer where this is needed (provided
by Robbie Ko):
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt2
$ mkdir -p /mnt/data/n1/n2
$ mkdir /mnt/data/n4
$ mkdir -p /mnt/data/t6/t7
$ mkdir /mnt/data/t5
$ mkdir /mnt/data/t7
$ mkdir /mnt/data/n4/t2
$ mkdir /mnt/data/t4
$ mkdir /mnt/data/t3
$ mv /mnt/data/t7 /mnt/data/n4/t2
$ mv /mnt/data/t4 /mnt/data/n4/t2/t7
$ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
$ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
$ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
$ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7
$ btrfs subvolume snapshot -r /mnt /mnt/snap1
$ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
$ mv /mnt/data/n4/t2 /mnt/data/n4/n1
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
$ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
$ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
$ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2
$ btrfs subvolume snapshot -r /mnt /mnt/snap2
$ btrfs send /mnt/snap1 | btrfs receive /mnt2
$ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
ERROR: send ioctl failed with -12: Cannot allocate memory
Where the parent snapshot directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- t2/ (ino 265)
|-- t7/ (ino 264)
|-- t4/ (ino 266)
|-- t5/ (ino 263)
|-- t6/ (ino 261)
|-- n1/ (ino 258)
|-- n2/ (ino 259)
|-- t7/ (ino 262)
|-- t3/ (ino 267)
And the send snapshot's directory hierarchy is the following:
. (ino 256)
|-- data/ (ino 257)
|-- n4/ (ino 260)
|-- n1/ (ino 258)
|-- t2/ (ino 265)
|-- n2/ (ino 259)
|-- t3/ (ino 267)
| |-- t7 (ino 264)
|
|-- t6/ (ino 261)
| |-- t4/ (ino 266)
| |-- t5/ (ino 263)
|
|-- t7/ (ino 262)
While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
have been done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:
start inode 264, send progress of 265 for example
parent of 264 -> 267
parent of 267 -> 262
parent of 262 -> 259
parent of 259 -> 261
parent of 261 -> 263
parent of 263 -> 266
parent of 266 -> 264
|--> back to first iteration while current path string length
is <= PATH_MAX, and fail with -ENOMEM otherwise
So fix this by always checking whether a directory rename needs to be
delayed, regardless of whether the current inode was orphanized or not.
A test case for fstests follows soon.
Thanks to Robbie Ko for providing a reproducer for this problem.
Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-04-09 21:09:14 +08:00
|
|
|
ret = send_rename(sctx, valid_path,
|
|
|
|
cur->full_path);
|
|
|
|
if (!ret)
|
|
|
|
ret = fs_path_copy(valid_path,
|
|
|
|
cur->full_path);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
ret = send_link(sctx, cur->full_path,
|
|
|
|
valid_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
2013-08-17 04:52:55 +08:00
|
|
|
ret = dup_ref(cur, &check_dirs);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
|
|
|
|
/*
|
|
|
|
* Check if we can already rmdir the directory. If not,
|
|
|
|
* orphanize it. For every dir item inside that gets deleted
|
|
|
|
* later, we do this check again and rmdir it then if possible.
|
|
|
|
* See the use of check_dirs for more details.
|
|
|
|
*/
|
2014-02-19 22:31:44 +08:00
|
|
|
ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
|
|
|
|
sctx->cur_ino);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
|
|
|
ret = send_rmdir(sctx, valid_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
} else if (!is_orphan) {
|
|
|
|
ret = orphanize_inode(sctx, sctx->cur_ino,
|
|
|
|
sctx->cur_inode_gen, valid_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
is_orphan = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
list_for_each_entry(cur, &sctx->deleted_refs, list) {
|
2013-08-17 04:52:55 +08:00
|
|
|
ret = dup_ref(cur, &check_dirs);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
2012-07-28 17:46:29 +08:00
|
|
|
} else if (S_ISDIR(sctx->cur_inode_mode) &&
|
|
|
|
!list_empty(&sctx->deleted_refs)) {
|
|
|
|
/*
|
|
|
|
* We have a moved dir. Add the old parent to check_dirs
|
|
|
|
*/
|
|
|
|
cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
|
|
|
|
list);
|
2013-08-17 04:52:55 +08:00
|
|
|
ret = dup_ref(cur, &check_dirs);
|
2012-07-28 17:46:29 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-07-26 05:19:24 +08:00
|
|
|
} else if (!S_ISDIR(sctx->cur_inode_mode)) {
|
|
|
|
/*
|
|
|
|
* We have a non dir inode. Go through all deleted refs and
|
|
|
|
* unlink them if they were not already overwritten by other
|
|
|
|
* inodes.
|
|
|
|
*/
|
|
|
|
list_for_each_entry(cur, &sctx->deleted_refs, list) {
|
|
|
|
ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
|
|
|
|
sctx->cur_ino, sctx->cur_inode_gen,
|
|
|
|
cur->name, cur->name_len);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (!ret) {
|
2012-07-28 16:42:24 +08:00
|
|
|
ret = send_unlink(sctx, cur->full_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
2013-08-17 04:52:55 +08:00
|
|
|
ret = dup_ref(cur, &check_dirs);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If the inode is still orphan, unlink the orphan. This may
|
|
|
|
* happen when a previous inode did overwrite the first ref
|
|
|
|
* of this inode and no new refs were added for the current
|
2012-07-28 20:11:31 +08:00
|
|
|
* inode. Unlinking does not mean that the inode is deleted in
|
|
|
|
* all cases. There may still be links to this inode in other
|
|
|
|
* places.
|
2012-07-26 05:19:24 +08:00
|
|
|
*/
|
2012-07-28 16:42:24 +08:00
|
|
|
if (is_orphan) {
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = send_unlink(sctx, valid_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We did collect all parent dirs where cur_inode was once located. We
|
|
|
|
* now go through all these dirs and check if they are pending for
|
|
|
|
* deletion and if it's finally possible to perform the rmdir now.
|
|
|
|
* We also update the inode stats of the parent dirs here.
|
|
|
|
*/
|
2013-08-17 04:52:55 +08:00
|
|
|
list_for_each_entry(cur, &check_dirs, list) {
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* In case we had refs into dirs that were not processed yet,
|
|
|
|
* we don't need to do the utime and rmdir logic for these dirs.
|
|
|
|
* The dir will be processed later.
|
|
|
|
*/
|
2013-08-17 04:52:55 +08:00
|
|
|
if (cur->dir > sctx->cur_ino)
|
2012-07-26 05:19:24 +08:00
|
|
|
continue;
|
|
|
|
|
2013-08-17 04:52:55 +08:00
|
|
|
ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (ret == inode_state_did_create ||
|
|
|
|
ret == inode_state_no_change) {
|
|
|
|
/* TODO delayed utimes */
|
2013-08-17 04:52:55 +08:00
|
|
|
ret = send_utimes(sctx, cur->dir, cur->dir_gen);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2014-02-17 05:01:39 +08:00
|
|
|
} else if (ret == inode_state_did_delete &&
|
|
|
|
cur->dir != last_dir_ino_rm) {
|
2014-02-19 22:31:44 +08:00
|
|
|
ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
|
|
|
|
sctx->cur_ino);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
2013-08-17 04:52:55 +08:00
|
|
|
ret = get_cur_path(sctx, cur->dir,
|
|
|
|
cur->dir_gen, valid_path);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = send_rmdir(sctx, valid_path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2014-02-17 05:01:39 +08:00
|
|
|
last_dir_ino_rm = cur->dir;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
out:
|
2013-08-17 04:52:55 +08:00
|
|
|
__free_recorded_refs(&check_dirs);
|
2012-07-26 05:19:24 +08:00
|
|
|
free_recorded_refs(sctx);
|
2013-05-08 15:51:52 +08:00
|
|
|
fs_path_free(valid_path);
|
2012-07-26 05:19:24 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-03-03 21:31:03 +08:00
|
|
|
static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
		      struct fs_path *name, void *ctx, struct list_head *refs)
{
	/*
	 * Build the full path of one ref (current path of @dir followed by
	 * @name) and append it to @refs through __record_ref().
	 *
	 * @root is the tree used to look up the generation of @dir.
	 * @num and @index are part of the iterator callback signature and
	 * are not used here.
	 *
	 * Returns the result of __record_ref() on the normal path, or the
	 * negative error of the step that failed.  The path buffer is freed
	 * here whenever the returned value is non-zero; when 0 is returned
	 * it is not freed by this function.
	 */
	struct send_ctx *sctx = ctx;
	struct fs_path *full_path;
	u64 dir_gen;
	int ret;

	full_path = fs_path_alloc();
	if (!full_path)
		return -ENOMEM;

	ret = get_inode_info(root, dir, NULL, &dir_gen, NULL, NULL,
			     NULL, NULL);
	if (ret < 0)
		goto out;

	/* Path of the parent directory first, then the entry name. */
	ret = get_cur_path(sctx, dir, dir_gen, full_path);
	if (ret < 0)
		goto out;

	ret = fs_path_add_path(full_path, name);
	if (ret < 0)
		goto out;

	ret = __record_ref(refs, dir, dir_gen, full_path);

out:
	if (ret != 0)
		fs_path_free(full_path);
	return ret;
}
|
|
|
|
|
2014-03-03 21:31:03 +08:00
|
|
|
/*
 * Ref iterator callback: record one ref, resolved against the send root,
 * on the send context's new_refs list.
 */
static int __record_new_ref(int num, u64 dir, int index,
			    struct fs_path *name,
			    void *ctx)
{
	struct send_ctx *sctx = ctx;
	struct btrfs_root *root = sctx->send_root;
	struct list_head *refs = &sctx->new_refs;

	return record_ref(root, num, dir, index, name, ctx, refs);
}
|
|
|
|
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
/*
 * Ref iterator callback: record one ref, resolved against the parent root,
 * on the send context's deleted_refs list.
 */
static int __record_deleted_ref(int num, u64 dir, int index,
				struct fs_path *name,
				void *ctx)
{
	struct send_ctx *sctx = ctx;
	struct btrfs_root *root = sctx->parent_root;
	struct list_head *refs = &sctx->deleted_refs;

	return record_ref(root, num, dir, index, name, ctx, refs);
}
|
|
|
|
|
|
|
|
static int record_new_ref(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
|
|
|
|
sctx->cmp_key, 0, __record_new_ref, sctx);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int record_deleted_ref(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
|
|
|
|
sctx->cmp_key, 0, __record_deleted_ref, sctx);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct find_ref_ctx {
|
|
|
|
u64 dir;
|
2013-08-17 04:52:55 +08:00
|
|
|
u64 dir_gen;
|
|
|
|
struct btrfs_root *root;
|
2012-07-26 05:19:24 +08:00
|
|
|
struct fs_path *name;
|
|
|
|
int found_idx;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Ref iterator callback used by find_iref().
 *
 * A ref matches when it points to the wanted directory, carries the wanted
 * name and the directory has the wanted generation in ctx->root.
 *
 * Returns 1 after storing @num in ctx->found_idx on a match, 0 when the ref
 * does not match, or whatever non-zero value get_inode_info() reported.
 */
static int __find_iref(int num, u64 dir, int index,
		       struct fs_path *name,
		       void *ctx_)
{
	struct find_ref_ctx *ctx = ctx_;
	u64 dir_gen;
	int ret;

	if (dir != ctx->dir)
		return 0;
	if (fs_path_len(name) != fs_path_len(ctx->name))
		return 0;
	if (strncmp(name->start, ctx->name->start, fs_path_len(name)) != 0)
		return 0;

	/*
	 * To avoid doing extra lookups the generation is only fetched once
	 * everything else matches.
	 */
	ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
			     NULL, NULL, NULL);
	if (ret)
		return ret;
	if (dir_gen != ctx->dir_gen)
		return 0;

	ctx->found_idx = num;
	return 1;
}
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
/*
 * Look for a ref to (@dir, @dir_gen) named @name among the refs found at
 * @path/@key in @root.
 *
 * Returns the index stored by __find_iref() for the matching ref, -ENOENT
 * when no ref matched, or a negative error from the iteration.
 */
static int find_iref(struct btrfs_root *root,
		     struct btrfs_path *path,
		     struct btrfs_key *key,
		     u64 dir, u64 dir_gen, struct fs_path *name)
{
	struct find_ref_ctx ctx = {
		.dir = dir,
		.dir_gen = dir_gen,
		.root = root,
		.name = name,
		.found_idx = -1,	/* "nothing found yet" */
	};
	int ret;

	ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
	if (ret < 0)
		return ret;

	return ctx.found_idx == -1 ? -ENOENT : ctx.found_idx;
}
|
|
|
|
|
|
|
|
/*
 * Ref iterator callback for refs of the send root: a ref that has no
 * counterpart (same directory, directory generation and name) in the
 * parent root is recorded as a new ref.
 *
 * Returns 0 when the ref already exists in the parent root, the result of
 * __record_new_ref() when it does not, or an error from the lookups.
 */
static int __record_changed_new_ref(int num, u64 dir, int index,
				    struct fs_path *name,
				    void *ctx)
{
	struct send_ctx *sctx = ctx;
	u64 dir_gen;
	int ret;

	ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
			     NULL, NULL, NULL);
	if (ret)
		return ret;

	ret = find_iref(sctx->parent_root, sctx->right_path,
			sctx->cmp_key, dir, dir_gen, name);
	if (ret == -ENOENT)
		return __record_new_ref(num, dir, index, name, sctx);

	/* A positive value is the index of the existing ref, not an error. */
	return ret > 0 ? 0 : ret;
}
|
|
|
|
|
|
|
|
/*
 * Ref iterator callback for refs of the parent root: a ref that has no
 * counterpart (same directory, directory generation and name) in the send
 * root is recorded as a deleted ref.
 *
 * Returns 0 when the ref still exists in the send root, the result of
 * __record_deleted_ref() when it does not, or an error from the lookups.
 */
static int __record_changed_deleted_ref(int num, u64 dir, int index,
					struct fs_path *name,
					void *ctx)
{
	struct send_ctx *sctx = ctx;
	u64 dir_gen;
	int ret;

	ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
			     NULL, NULL, NULL);
	if (ret)
		return ret;

	ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
			dir, dir_gen, name);
	if (ret == -ENOENT)
		return __record_deleted_ref(num, dir, index, name, sctx);

	/* A positive value is the index of the existing ref, not an error. */
	return ret > 0 ? 0 : ret;
}
|
|
|
|
|
|
|
|
static int record_changed_ref(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->cmp_key, 0, __record_changed_new_ref, sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Record and process all refs at once. Needed when an inode changes the
|
|
|
|
* generation number, which means that it was deleted and recreated.
|
|
|
|
*/
|
|
|
|
static int process_all_refs(struct send_ctx *sctx,
|
|
|
|
enum btrfs_compare_tree_result cmd)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
int slot;
|
|
|
|
iterate_inode_ref_t cb;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
int pending_move = 0;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (cmd == BTRFS_COMPARE_TREE_NEW) {
|
|
|
|
root = sctx->send_root;
|
|
|
|
cb = __record_new_ref;
|
|
|
|
} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
|
|
|
|
root = sctx->parent_root;
|
|
|
|
cb = __record_deleted_ref;
|
|
|
|
} else {
|
2014-02-04 02:24:19 +08:00
|
|
|
btrfs_err(sctx->send_root->fs_info,
|
|
|
|
"Wrong command %d in process_all_refs", cmd);
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = sctx->cmp_key->objectid;
|
|
|
|
key.type = BTRFS_INODE_REF_KEY;
|
|
|
|
key.offset = 0;
|
2014-02-06 00:48:56 +08:00
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2014-02-06 00:48:56 +08:00
|
|
|
while (1) {
|
2012-07-26 05:19:24 +08:00
|
|
|
eb = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
2014-02-06 00:48:56 +08:00
|
|
|
if (slot >= btrfs_header_nritems(eb)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
else if (ret > 0)
|
|
|
|
break;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
btrfs_item_key_to_cpu(eb, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.objectid != key.objectid ||
|
2012-10-15 16:30:45 +08:00
|
|
|
(found_key.type != BTRFS_INODE_REF_KEY &&
|
|
|
|
found_key.type != BTRFS_INODE_EXTREF_KEY))
|
2012-07-26 05:19:24 +08:00
|
|
|
break;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2014-02-06 00:48:56 +08:00
|
|
|
path->slots[0]++;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
2012-07-28 22:33:49 +08:00
|
|
|
btrfs_release_path(path);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2016-08-24 23:57:52 +08:00
|
|
|
/*
|
|
|
|
* We don't actually care about pending_move as we are simply
|
|
|
|
* re-creating this inode and will be rename'ing it into place once we
|
|
|
|
* rename the parent directory.
|
|
|
|
*/
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
ret = process_recorded_refs(sctx, &pending_move);
|
2012-07-26 05:19:24 +08:00
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int send_set_xattr(struct send_ctx *sctx,
|
|
|
|
struct fs_path *path,
|
|
|
|
const char *name, int name_len,
|
|
|
|
const char *data, int data_len)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
|
|
|
|
TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
|
|
|
|
TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
|
|
|
|
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
|
|
|
|
tlv_put_failure:
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int send_remove_xattr(struct send_ctx *sctx,
|
|
|
|
struct fs_path *path,
|
|
|
|
const char *name, int name_len)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
|
|
|
|
TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
|
|
|
|
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
|
|
|
|
tlv_put_failure:
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __process_new_xattr(int num, struct btrfs_key *di_key,
|
|
|
|
const char *name, int name_len,
|
|
|
|
const char *data, int data_len,
|
|
|
|
u8 type, void *ctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct send_ctx *sctx = ctx;
|
|
|
|
struct fs_path *p;
|
2016-09-27 19:03:22 +08:00
|
|
|
struct posix_acl_xattr_header dummy_acl;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
p = fs_path_alloc();
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!p)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/*
|
2016-05-20 09:18:45 +08:00
|
|
|
* This hack is needed because empty acls are stored as zero byte
|
2012-07-26 05:19:24 +08:00
|
|
|
* data in xattrs. Problem with that is, that receiving these zero byte
|
2016-05-20 09:18:45 +08:00
|
|
|
* acls will fail later. To fix this, we send a dummy acl list that
|
2012-07-26 05:19:24 +08:00
|
|
|
* only contains the version number and no entries.
|
|
|
|
*/
|
|
|
|
if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
|
|
|
|
!strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
|
|
|
|
if (data_len == 0) {
|
|
|
|
dummy_acl.a_version =
|
|
|
|
cpu_to_le32(POSIX_ACL_XATTR_VERSION);
|
|
|
|
data = (char *)&dummy_acl;
|
|
|
|
data_len = sizeof(dummy_acl);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
|
|
|
|
|
|
|
|
out:
|
2013-05-08 15:51:52 +08:00
|
|
|
fs_path_free(p);
|
2012-07-26 05:19:24 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
|
|
|
|
const char *name, int name_len,
|
|
|
|
const char *data, int data_len,
|
|
|
|
u8 type, void *ctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct send_ctx *sctx = ctx;
|
|
|
|
struct fs_path *p;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
p = fs_path_alloc();
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!p)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = send_remove_xattr(sctx, p, name, name_len);
|
|
|
|
|
|
|
|
out:
|
2013-05-08 15:51:52 +08:00
|
|
|
fs_path_free(p);
|
2012-07-26 05:19:24 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int process_new_xattr(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = iterate_dir_item(sctx->send_root, sctx->left_path,
|
|
|
|
sctx->cmp_key, __process_new_xattr, sctx);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int process_deleted_xattr(struct send_ctx *sctx)
|
|
|
|
{
|
2016-09-13 03:35:52 +08:00
|
|
|
return iterate_dir_item(sctx->parent_root, sctx->right_path,
|
|
|
|
sctx->cmp_key, __process_deleted_xattr, sctx);
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Search state shared between find_xattr() and its iterate_dir_item()
 * callback __find_xattr().
 */
struct find_xattr_ctx {
	/* Name being searched for, together with its length. */
	const char *name;
	int name_len;
	/* Item number passed to the callback on a match; -1 while not found. */
	int found_idx;
	/* kmemdup()'ed copy of the matching xattr's value; owned by find_xattr(). */
	char *found_data;
	/* Length in bytes of found_data. */
	int found_data_len;
};
|
|
|
|
|
|
|
|
static int __find_xattr(int num, struct btrfs_key *di_key,
|
|
|
|
const char *name, int name_len,
|
|
|
|
const char *data, int data_len,
|
|
|
|
u8 type, void *vctx)
|
|
|
|
{
|
|
|
|
struct find_xattr_ctx *ctx = vctx;
|
|
|
|
|
|
|
|
if (name_len == ctx->name_len &&
|
|
|
|
strncmp(name, ctx->name, name_len) == 0) {
|
|
|
|
ctx->found_idx = num;
|
|
|
|
ctx->found_data_len = data_len;
|
2016-01-19 01:42:13 +08:00
|
|
|
ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!ctx->found_data)
|
|
|
|
return -ENOMEM;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
static int find_xattr(struct btrfs_root *root,
|
2012-07-26 05:19:24 +08:00
|
|
|
struct btrfs_path *path,
|
|
|
|
struct btrfs_key *key,
|
|
|
|
const char *name, int name_len,
|
|
|
|
char **data, int *data_len)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct find_xattr_ctx ctx;
|
|
|
|
|
|
|
|
ctx.name = name;
|
|
|
|
ctx.name_len = name_len;
|
|
|
|
ctx.found_idx = -1;
|
|
|
|
ctx.found_data = NULL;
|
|
|
|
ctx.found_data_len = 0;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (ctx.found_idx == -1)
|
|
|
|
return -ENOENT;
|
|
|
|
if (data) {
|
|
|
|
*data = ctx.found_data;
|
|
|
|
*data_len = ctx.found_data_len;
|
|
|
|
} else {
|
|
|
|
kfree(ctx.found_data);
|
|
|
|
}
|
|
|
|
return ctx.found_idx;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
|
|
|
|
const char *name, int name_len,
|
|
|
|
const char *data, int data_len,
|
|
|
|
u8 type, void *ctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct send_ctx *sctx = ctx;
|
|
|
|
char *found_data = NULL;
|
|
|
|
int found_data_len = 0;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = find_xattr(sctx->parent_root, sctx->right_path,
|
|
|
|
sctx->cmp_key, name, name_len, &found_data,
|
|
|
|
&found_data_len);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret == -ENOENT) {
|
|
|
|
ret = __process_new_xattr(num, di_key, name, name_len, data,
|
|
|
|
data_len, type, ctx);
|
|
|
|
} else if (ret >= 0) {
|
|
|
|
if (data_len != found_data_len ||
|
|
|
|
memcmp(data, found_data, data_len)) {
|
|
|
|
ret = __process_new_xattr(num, di_key, name, name_len,
|
|
|
|
data, data_len, type, ctx);
|
|
|
|
} else {
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
kfree(found_data);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
|
|
|
|
const char *name, int name_len,
|
|
|
|
const char *data, int data_len,
|
|
|
|
u8 type, void *ctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct send_ctx *sctx = ctx;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
|
|
|
|
name, name_len, NULL, NULL);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret == -ENOENT)
|
|
|
|
ret = __process_deleted_xattr(num, di_key, name, name_len, data,
|
|
|
|
data_len, type, ctx);
|
|
|
|
else if (ret >= 0)
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int process_changed_xattr(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = iterate_dir_item(sctx->send_root, sctx->left_path,
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->cmp_key, __process_changed_new_xattr, sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->cmp_key, __process_changed_deleted_xattr, sctx);
|
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int process_all_new_xattrs(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
int slot;
|
|
|
|
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
root = sctx->send_root;
|
|
|
|
|
|
|
|
key.objectid = sctx->cmp_key->objectid;
|
|
|
|
key.type = BTRFS_XATTR_ITEM_KEY;
|
|
|
|
key.offset = 0;
|
2014-02-06 00:48:56 +08:00
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2014-02-06 00:48:56 +08:00
|
|
|
while (1) {
|
2012-07-26 05:19:24 +08:00
|
|
|
eb = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
2014-02-06 00:48:56 +08:00
|
|
|
if (slot >= btrfs_header_nritems(eb)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0) {
|
|
|
|
goto out;
|
|
|
|
} else if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2014-02-06 00:48:56 +08:00
|
|
|
btrfs_item_key_to_cpu(eb, &found_key, slot);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (found_key.objectid != key.objectid ||
|
|
|
|
found_key.type != key.type) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = iterate_dir_item(root, path, &found_key,
|
|
|
|
__process_new_xattr, sctx);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2014-02-06 00:48:56 +08:00
|
|
|
path->slots[0]++;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-10-25 23:36:01 +08:00
|
|
|
/*
 * Copy up to len bytes of the current inode (sctx->cur_ino in the send
 * root), starting at file offset offset, into sctx->read_buf.
 *
 * The request is clamped to the inode's i_size. Returns the number of bytes
 * copied (0 when nothing lies inside i_size), -ENOMEM when a page cannot be
 * obtained, -EIO when a page cannot be brought up to date, or the error from
 * btrfs_iget(). On error, bytes copied so far are not reported.
 */
static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
{
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	struct page *page;
	char *addr;
	struct btrfs_key key;
	pgoff_t index = offset >> PAGE_SHIFT;
	pgoff_t last_index;
	unsigned pg_offset = offset & ~PAGE_MASK;
	ssize_t ret = 0;
	u64 isize;

	key.objectid = sctx->cur_ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	/*
	 * Never read past i_size. The remaining length is i_size - offset;
	 * the reverse subtraction would wrap around to a huge u32.
	 */
	isize = i_size_read(inode);
	if (offset + len > isize) {
		if (offset > isize)
			len = 0;
		else
			len = isize - offset;
	}
	if (len == 0)
		goto out;

	last_index = (offset + len - 1) >> PAGE_SHIFT;

	/* initial readahead */
	memset(&sctx->ra, 0, sizeof(struct file_ra_state));
	file_ra_state_init(&sctx->ra, inode->i_mapping);
	btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
		       last_index - index + 1);

	while (index <= last_index) {
		/* Only the first page can start at a non-zero pg_offset. */
		unsigned cur_len = min_t(unsigned, len,
					 PAGE_SIZE - pg_offset);

		page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
		if (!page) {
			ret = -ENOMEM;
			break;
		}

		if (!PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			/* Take the page lock again; it is dropped below. */
			lock_page(page);
			if (!PageUptodate(page)) {
				unlock_page(page);
				put_page(page);
				ret = -EIO;
				break;
			}
		}

		addr = kmap(page);
		/* ret is the number of bytes already placed in read_buf. */
		memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
		kunmap(page);
		unlock_page(page);
		put_page(page);
		index++;
		pg_offset = 0;
		len -= cur_len;
		ret += cur_len;
	}
out:
	iput(inode);
	return ret;
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
/*
|
|
|
|
* Read some bytes from the current inode/file and send a write command to
|
|
|
|
* user space.
|
|
|
|
*/
|
|
|
|
static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
|
|
|
|
{
|
2016-09-20 22:05:03 +08:00
|
|
|
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
|
2012-07-26 05:19:24 +08:00
|
|
|
int ret = 0;
|
|
|
|
struct fs_path *p;
|
2013-10-25 23:36:01 +08:00
|
|
|
ssize_t num_read = 0;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
p = fs_path_alloc();
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!p)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2016-09-20 22:05:03 +08:00
|
|
|
btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2013-10-25 23:36:01 +08:00
|
|
|
num_read = fill_read_buf(sctx, offset, len);
|
|
|
|
if (num_read <= 0) {
|
|
|
|
if (num_read < 0)
|
|
|
|
ret = num_read;
|
2012-07-26 05:19:24 +08:00
|
|
|
goto out;
|
2013-10-25 23:36:01 +08:00
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
|
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
|
2012-07-28 22:33:49 +08:00
|
|
|
TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
|
|
|
|
tlv_put_failure:
|
|
|
|
out:
|
2013-05-08 15:51:52 +08:00
|
|
|
fs_path_free(p);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2012-07-28 22:33:49 +08:00
|
|
|
return num_read;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Send a clone command to user space.
|
|
|
|
*/
|
|
|
|
static int send_clone(struct send_ctx *sctx,
|
|
|
|
u64 offset, u32 len,
|
|
|
|
struct clone_root *clone_root)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct fs_path *p;
|
|
|
|
u64 gen;
|
|
|
|
|
2016-09-20 22:05:03 +08:00
|
|
|
btrfs_debug(sctx->send_root->fs_info,
|
|
|
|
"send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
|
|
|
|
offset, len, clone_root->root->objectid, clone_root->ino,
|
|
|
|
clone_root->offset);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
p = fs_path_alloc();
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!p)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
|
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
|
|
|
|
|
2012-07-28 22:33:49 +08:00
|
|
|
if (clone_root->root == sctx->send_root) {
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
|
2012-07-27 05:39:10 +08:00
|
|
|
&gen, NULL, NULL, NULL, NULL);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = get_cur_path(sctx, clone_root->ino, gen, p);
|
|
|
|
} else {
|
2013-05-08 15:51:52 +08:00
|
|
|
ret = get_inode_path(clone_root->root, clone_root->ino, p);
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2015-06-05 05:17:25 +08:00
|
|
|
/*
|
|
|
|
* If the parent we're using has a received_uuid set then use that as
|
|
|
|
* our clone source as that is what we will look for when doing a
|
|
|
|
* receive.
|
|
|
|
*
|
|
|
|
* This covers the case that we create a snapshot off of a received
|
|
|
|
* subvolume and then use that as the parent and try to receive on a
|
|
|
|
* different host.
|
|
|
|
*/
|
|
|
|
if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
|
|
|
|
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
|
|
|
|
clone_root->root->root_item.received_uuid);
|
|
|
|
else
|
|
|
|
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
|
|
|
|
clone_root->root->root_item.uuid);
|
2012-07-26 05:19:24 +08:00
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
|
2013-12-03 23:55:48 +08:00
|
|
|
le64_to_cpu(clone_root->root->root_item.ctransid));
|
2012-07-26 05:19:24 +08:00
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
|
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
|
|
|
|
clone_root->offset);
|
|
|
|
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
|
|
|
|
tlv_put_failure:
|
|
|
|
out:
|
2013-05-08 15:51:52 +08:00
|
|
|
fs_path_free(p);
|
2012-07-26 05:19:24 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-02-05 04:54:57 +08:00
|
|
|
/*
|
|
|
|
* Send an update extent command to user space.
|
|
|
|
*/
|
|
|
|
static int send_update_extent(struct send_ctx *sctx,
|
|
|
|
u64 offset, u32 len)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct fs_path *p;
|
|
|
|
|
2013-05-08 15:51:52 +08:00
|
|
|
p = fs_path_alloc();
|
2013-02-05 04:54:57 +08:00
|
|
|
if (!p)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
|
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
|
|
|
|
TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
|
|
|
|
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
|
|
|
|
tlv_put_failure:
|
|
|
|
out:
|
2013-05-08 15:51:52 +08:00
|
|
|
fs_path_free(p);
|
2013-02-05 04:54:57 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-10-23 00:18:51 +08:00
|
|
|
/*
 * Send the hole between sctx->cur_inode_last_extent and end for the current
 * inode.
 *
 * Normally the hole is sent as BTRFS_SEND_C_WRITE commands carrying zeroes,
 * at most BTRFS_SEND_READ_SIZE bytes each. When the stream was requested
 * without file data (BTRFS_SEND_FLAG_NO_FILE_DATA), update-extent commands
 * are sent instead, as send_extent_data() does for regular extents.
 *
 * Returns 0 or the result of the last command sent on success, or a negative
 * errno.
 */
static int send_hole(struct send_ctx *sctx, u64 end)
{
	struct fs_path *p = NULL;
	u64 offset = sctx->cur_inode_last_extent;
	u64 len;
	int ret = 0;

	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
		/*
		 * send_update_extent() takes a u32 length, so split holes
		 * larger than that instead of silently truncating them.
		 */
		while (offset < end) {
			len = min_t(u64, end - offset, U32_MAX);
			ret = send_update_extent(sctx, offset, len);
			if (ret < 0)
				break;
			offset += len;
		}
		return ret;
	}

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;
	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto tlv_put_failure;
	/* Every write command below sends a prefix of this zeroed buffer. */
	memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
	while (offset < end) {
		len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);

		ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
		if (ret < 0)
			break;
		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
		TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
		TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
		ret = send_cmd(sctx);
		if (ret < 0)
			break;
		offset += len;
	}
tlv_put_failure:
	fs_path_free(p);
	return ret;
}
|
|
|
|
|
Btrfs: send, fix file corruption due to incorrect cloning operations
If we have a file that shares an extent with other files, when processing
the extent item relative to a shared extent, we blindly issue a clone
operation that will target a length matching the length in the extent item
and uses as a source some other file the receiver already has and points
to the same extent. However that range in the other file might not
exclusively point only to the shared extent, and so using that length
will result in the receiver getting a file with different data from the
one in the send snapshot. This issue happened both for incremental and
full send operations.
So fix this by issuing clone operations with lengths that don't cover
regions of the source file that point to different extents (or have holes).
The following test case for fstests reproduces the problem.
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -fr $send_files_dir
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_supported_fs btrfs
_supported_os Linux
_require_scratch
_need_to_be_root
_require_cp_reflink
_require_xfs_io_command "fpunch"
send_files_dir=$TEST_DIR/btrfs-test-$seq
rm -f $seqres.full
rm -fr $send_files_dir
mkdir $send_files_dir
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount
# Create our test file with a single 100K extent.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Clone our file into a new file named bar.
cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Now overwrite parts of our foo file.
$XFS_IO_PROG -c "pwrite -S 0xbb 50K 10K" \
-c "pwrite -S 0xcc 90K 10K" \
-c "fpunch 70K 10k" \
$SCRATCH_MNT/foo | _filter_xfs_io
_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
$SCRATCH_MNT/snap
echo "File digests in the original filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
_run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap
# Now recreate the filesystem by receiving the send stream and verify
# we get the same file contents that the original filesystem had.
_scratch_unmount
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount
_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap
# We expect the destination filesystem to have exactly the same file
# data as the original filesystem.
# The btrfs send implementation had a bug where it sent a clone
# operation from file foo into file bar covering the whole [0, 100K[
# range after creating and writing the file foo. This was incorrect
# because the file bar now included the updates done to file foo after
# we cloned foo to bar, breaking the COW nature of reflink copies
# (cloned extents).
echo "File digests in the new filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
status=0
exit
Another test case that reproduces the problem when we have compressed
extents:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -fr $send_files_dir
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_supported_fs btrfs
_supported_os Linux
_require_scratch
_need_to_be_root
_require_cp_reflink
send_files_dir=$TEST_DIR/btrfs-test-$seq
rm -f $seqres.full
rm -fr $send_files_dir
mkdir $send_files_dir
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create our file with an extent of 100K starting at file offset 0K.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K" \
-c "fsync" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Rewrite part of the previous extent (its first 40K) and write a new
# 100K extent starting at file offset 100K.
$XFS_IO_PROG -c "pwrite -S 0xbb 0K 40K" \
-c "pwrite -S 0xcc 100K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Our file foo now has 3 file extent items in its metadata:
#
# 1) One covering the file range 0 to 40K;
# 2) One covering the file range 40K to 100K, which points to the first
# extent we wrote to the file and has a data offset field with value
# 40K (our file no longer uses the first 40K of data from that
# extent);
# 3) One covering the file range 100K to 200K.
# Now clone our file foo into file bar.
cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Create our snapshot for the send operation.
_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
$SCRATCH_MNT/snap
echo "File digests in the original filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
_run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap
# Now recreate the filesystem by receiving the send stream and verify we
# get the same file contents that the original filesystem had.
# Btrfs send used to issue a clone operation from foo's range
# [80K, 140K[ to bar's range [40K, 100K[ when cloning the extent pointed
# to by foo's second file extent item, this was incorrect because of bad
# accounting of the file extent item's data offset field. The correct
# range to clone from should have been [40K, 100K[.
_scratch_unmount
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap
echo "File digests in the new filesystem:"
# Must match the digests we got in the original filesystem.
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
status=0
exit
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-02 17:47:34 +08:00
|
|
|
static int send_extent_data(struct send_ctx *sctx,
|
|
|
|
const u64 offset,
|
|
|
|
const u64 len)
|
|
|
|
{
|
|
|
|
u64 sent = 0;
|
|
|
|
|
|
|
|
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
|
|
|
|
return send_update_extent(sctx, offset, len);
|
|
|
|
|
|
|
|
while (sent < len) {
|
|
|
|
u64 size = len - sent;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (size > BTRFS_SEND_READ_SIZE)
|
|
|
|
size = BTRFS_SEND_READ_SIZE;
|
|
|
|
ret = send_write(sctx, offset + sent, size);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
if (!ret)
|
|
|
|
break;
|
|
|
|
sent += ret;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int clone_range(struct send_ctx *sctx,
|
|
|
|
struct clone_root *clone_root,
|
|
|
|
const u64 disk_byte,
|
|
|
|
u64 data_offset,
|
|
|
|
u64 offset,
|
|
|
|
u64 len)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We can't send a clone operation for the entire range if we find
|
|
|
|
* extent items in the respective range in the source file that
|
|
|
|
* refer to different extents or if we find holes.
|
|
|
|
* So check for that and do a mix of clone and regular write/copy
|
|
|
|
* operations if needed.
|
|
|
|
*
|
|
|
|
* Example:
|
|
|
|
*
|
|
|
|
* mkfs.btrfs -f /dev/sda
|
|
|
|
* mount /dev/sda /mnt
|
|
|
|
* xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
|
|
|
|
* cp --reflink=always /mnt/foo /mnt/bar
|
|
|
|
* xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
|
|
|
|
* btrfs subvolume snapshot -r /mnt /mnt/snap
|
|
|
|
*
|
|
|
|
* If when we send the snapshot and we are processing file bar (which
|
|
|
|
* has a higher inode number than foo) we blindly send a clone operation
|
|
|
|
* for the [0, 100K[ range from foo to bar, the receiver ends up getting
|
|
|
|
* a file bar that matches the content of file foo - iow, doesn't match
|
|
|
|
* the content from bar in the original filesystem.
|
|
|
|
*/
|
|
|
|
key.objectid = clone_root->ino;
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = clone_root->offset;
|
|
|
|
ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret > 0 && path->slots[0] > 0) {
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
|
|
|
|
if (key.objectid == clone_root->ino &&
|
|
|
|
key.type == BTRFS_EXTENT_DATA_KEY)
|
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (true) {
|
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
|
|
int slot = path->slots[0];
|
|
|
|
struct btrfs_file_extent_item *ei;
|
|
|
|
u8 type;
|
|
|
|
u64 ext_len;
|
|
|
|
u64 clone_len;
|
|
|
|
|
|
|
|
if (slot >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(clone_root->root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
else if (ret > 0)
|
|
|
|
break;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, slot);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We might have an implicit trailing hole (NO_HOLES feature
|
|
|
|
* enabled). We deal with it after leaving this loop.
|
|
|
|
*/
|
|
|
|
if (key.objectid != clone_root->ino ||
|
|
|
|
key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
break;
|
|
|
|
|
|
|
|
ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
|
|
|
|
type = btrfs_file_extent_type(leaf, ei);
|
|
|
|
if (type == BTRFS_FILE_EXTENT_INLINE) {
|
|
|
|
ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
ext_len = PAGE_ALIGN(ext_len);
|
Btrfs: send, fix file corruption due to incorrect cloning operations
If we have a file that shares an extent with other files, when processing
the extent item relative to a shared extent, we blindly issue a clone
operation that will target a length matching the length in the extent item
and uses as a source some other file the receiver already has and points
to the same extent. However that range in the other file might not
exclusively point only to the shared extent, and so using that length
will result in the receiver getting a file with different data from the
one in the send snapshot. This issue happened both for incremental and
full send operations.
So fix this by issuing clone operations with lengths that don't cover
regions of the source file that point to different extents (or have holes).
The following test case for fstests reproduces the problem.
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -fr $send_files_dir
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_supported_fs btrfs
_supported_os Linux
_require_scratch
_need_to_be_root
_require_cp_reflink
_require_xfs_io_command "fpunch"
send_files_dir=$TEST_DIR/btrfs-test-$seq
rm -f $seqres.full
rm -fr $send_files_dir
mkdir $send_files_dir
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount
# Create our test file with a single 100K extent.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Clone our file into a new file named bar.
cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Now overwrite parts of our foo file.
$XFS_IO_PROG -c "pwrite -S 0xbb 50K 10K" \
-c "pwrite -S 0xcc 90K 10K" \
-c "fpunch 70K 10k" \
$SCRATCH_MNT/foo | _filter_xfs_io
_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
$SCRATCH_MNT/snap
echo "File digests in the original filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
_run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap
# Now recreate the filesystem by receiving the send stream and verify
# we get the same file contents that the original filesystem had.
_scratch_unmount
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount
_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap
# We expect the destination filesystem to have exactly the same file
# data as the original filesystem.
# The btrfs send implementation had a bug where it sent a clone
# operation from file foo into file bar covering the whole [0, 100K[
# range after creating and writing the file foo. This was incorrect
# because the file bar now included the updates done to file foo after
# we cloned foo to bar, breaking the COW nature of reflink copies
# (cloned extents).
echo "File digests in the new filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
status=0
exit
Another test case that reproduces the problem when we have compressed
extents:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -fr $send_files_dir
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_supported_fs btrfs
_supported_os Linux
_require_scratch
_need_to_be_root
_require_cp_reflink
send_files_dir=$TEST_DIR/btrfs-test-$seq
rm -f $seqres.full
rm -fr $send_files_dir
mkdir $send_files_dir
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create our file with an extent of 100K starting at file offset 0K.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K" \
-c "fsync" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Rewrite part of the previous extent (its first 40K) and write a new
# 100K extent starting at file offset 100K.
$XFS_IO_PROG -c "pwrite -S 0xbb 0K 40K" \
-c "pwrite -S 0xcc 100K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Our file foo now has 3 file extent items in its metadata:
#
# 1) One covering the file range 0 to 40K;
# 2) One covering the file range 40K to 100K, which points to the first
# extent we wrote to the file and has a data offset field with value
# 40K (our file no longer uses the first 40K of data from that
# extent);
# 3) One covering the file range 100K to 200K.
# Now clone our file foo into file bar.
cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Create our snapshot for the send operation.
_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
$SCRATCH_MNT/snap
echo "File digests in the original filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
_run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap
# Now recreate the filesystem by receiving the send stream and verify we
# get the same file contents that the original filesystem had.
# Btrfs send used to issue a clone operation from foo's range
# [80K, 140K[ to bar's range [40K, 100K[ when cloning the extent pointed
# to by foo's second file extent item, this was incorrect because of bad
# accounting of the file extent item's data offset field. The correct
# range to clone from should have been [40K, 100K[.
_scratch_unmount
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap
echo "File digests in the new filesystem:"
# Must match the digests we got in the original filesystem.
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
status=0
exit
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-02 17:47:34 +08:00
|
|
|
} else {
|
|
|
|
ext_len = btrfs_file_extent_num_bytes(leaf, ei);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (key.offset + ext_len <= clone_root->offset)
|
|
|
|
goto next;
|
|
|
|
|
|
|
|
if (key.offset > clone_root->offset) {
|
|
|
|
/* Implicit hole, NO_HOLES feature enabled. */
|
|
|
|
u64 hole_len = key.offset - clone_root->offset;
|
|
|
|
|
|
|
|
if (hole_len > len)
|
|
|
|
hole_len = len;
|
|
|
|
ret = send_extent_data(sctx, offset, hole_len);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
len -= hole_len;
|
|
|
|
if (len == 0)
|
|
|
|
break;
|
|
|
|
offset += hole_len;
|
|
|
|
clone_root->offset += hole_len;
|
|
|
|
data_offset += hole_len;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (key.offset >= clone_root->offset + len)
|
|
|
|
break;
|
|
|
|
|
|
|
|
clone_len = min_t(u64, ext_len, len);
|
|
|
|
|
|
|
|
if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
|
|
|
|
btrfs_file_extent_offset(leaf, ei) == data_offset)
|
|
|
|
ret = send_clone(sctx, offset, clone_len, clone_root);
|
|
|
|
else
|
|
|
|
ret = send_extent_data(sctx, offset, clone_len);
|
|
|
|
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
len -= clone_len;
|
|
|
|
if (len == 0)
|
|
|
|
break;
|
|
|
|
offset += clone_len;
|
|
|
|
clone_root->offset += clone_len;
|
|
|
|
data_offset += clone_len;
|
|
|
|
next:
|
|
|
|
path->slots[0]++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (len > 0)
|
|
|
|
ret = send_extent_data(sctx, offset, len);
|
|
|
|
else
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
static int send_write_or_clone(struct send_ctx *sctx,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
struct btrfs_key *key,
|
|
|
|
struct clone_root *clone_root)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_file_extent_item *ei;
|
|
|
|
u64 offset = key->offset;
|
|
|
|
u64 len;
|
|
|
|
u8 type;
|
2014-01-12 10:26:28 +08:00
|
|
|
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
type = btrfs_file_extent_type(path->nodes[0], ei);
|
2012-08-08 04:25:13 +08:00
|
|
|
if (type == BTRFS_FILE_EXTENT_INLINE) {
|
2014-01-04 13:07:00 +08:00
|
|
|
len = btrfs_file_extent_inline_len(path->nodes[0],
|
|
|
|
path->slots[0], ei);
|
2012-08-08 04:25:13 +08:00
|
|
|
/*
|
|
|
|
* it is possible the inline item won't cover the whole page,
|
|
|
|
* but there may be items after this page. Make
|
|
|
|
* sure to send the whole thing
|
|
|
|
*/
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
len = PAGE_ALIGN(len);
|
2012-08-08 04:25:13 +08:00
|
|
|
} else {
|
2012-07-26 05:19:24 +08:00
|
|
|
len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
|
2012-08-08 04:25:13 +08:00
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
if (offset + len > sctx->cur_inode_size)
|
|
|
|
len = sctx->cur_inode_size - offset;
|
|
|
|
if (len == 0) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2014-01-12 10:26:28 +08:00
|
|
|
if (clone_root && IS_ALIGNED(offset + len, bs)) {
|
Btrfs: send, fix file corruption due to incorrect cloning operations
If we have a file that shares an extent with other files, when processing
the extent item relative to a shared extent, we blindly issue a clone
operation that will target a length matching the length in the extent item
and uses as a source some other file the receiver already has and points
to the same extent. However that range in the other file might not
exclusively point only to the shared extent, and so using that length
will result in the receiver getting a file with different data from the
one in the send snapshot. This issue happened both for incremental and
full send operations.
So fix this by issuing clone operations with lengths that don't cover
regions of the source file that point to different extents (or have holes).
The following test case for fstests reproduces the problem.
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -fr $send_files_dir
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_supported_fs btrfs
_supported_os Linux
_require_scratch
_need_to_be_root
_require_cp_reflink
_require_xfs_io_command "fpunch"
send_files_dir=$TEST_DIR/btrfs-test-$seq
rm -f $seqres.full
rm -fr $send_files_dir
mkdir $send_files_dir
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount
# Create our test file with a single 100K extent.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Clone our file into a new file named bar.
cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Now overwrite parts of our foo file.
$XFS_IO_PROG -c "pwrite -S 0xbb 50K 10K" \
-c "pwrite -S 0xcc 90K 10K" \
-c "fpunch 70K 10k" \
$SCRATCH_MNT/foo | _filter_xfs_io
_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
$SCRATCH_MNT/snap
echo "File digests in the original filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
_run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap
# Now recreate the filesystem by receiving the send stream and verify
# we get the same file contents that the original filesystem had.
_scratch_unmount
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount
_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap
# We expect the destination filesystem to have exactly the same file
# data as the original filesystem.
# The btrfs send implementation had a bug where it sent a clone
# operation from file foo into file bar covering the whole [0, 100K[
# range after creating and writing the file foo. This was incorrect
# because the file bar now included the updates done to file foo after
# we cloned foo to bar, breaking the COW nature of reflink copies
# (cloned extents).
echo "File digests in the new filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
status=0
exit
Another test case that reproduces the problem when we have compressed
extents:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -fr $send_files_dir
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_supported_fs btrfs
_supported_os Linux
_require_scratch
_need_to_be_root
_require_cp_reflink
send_files_dir=$TEST_DIR/btrfs-test-$seq
rm -f $seqres.full
rm -fr $send_files_dir
mkdir $send_files_dir
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create our file with an extent of 100K starting at file offset 0K.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K" \
-c "fsync" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Rewrite part of the previous extent (its first 40K) and write a new
# 100K extent starting at file offset 100K.
$XFS_IO_PROG -c "pwrite -S 0xbb 0K 40K" \
-c "pwrite -S 0xcc 100K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Our file foo now has 3 file extent items in its metadata:
#
# 1) One covering the file range 0 to 40K;
# 2) One covering the file range 40K to 100K, which points to the first
# extent we wrote to the file and has a data offset field with value
# 40K (our file no longer uses the first 40K of data from that
# extent);
# 3) One covering the file range 100K to 200K.
# Now clone our file foo into file bar.
cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Create our snapshot for the send operation.
_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
$SCRATCH_MNT/snap
echo "File digests in the original filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
_run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap
# Now recreate the filesystem by receiving the send stream and verify we
# get the same file contents that the original filesystem had.
# Btrfs send used to issue a clone operation from foo's range
# [80K, 140K[ to bar's range [40K, 100K[ when cloning the extent pointed
# to by foo's second file extent item, this was incorrect because of bad
# accounting of the file extent item's data offset field. The correct
# range to clone from should have been [40K, 100K[.
_scratch_unmount
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap
echo "File digests in the new filesystem:"
# Must match the digests we got in the original filesystem.
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
status=0
exit
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-02 17:47:34 +08:00
|
|
|
u64 disk_byte;
|
|
|
|
u64 data_offset;
|
|
|
|
|
|
|
|
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
|
|
|
|
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
|
|
|
|
ret = clone_range(sctx, clone_root, disk_byte, data_offset,
|
|
|
|
offset, len);
|
2013-02-05 04:54:57 +08:00
|
|
|
} else {
|
Btrfs: send, fix file corruption due to incorrect cloning operations
If we have a file that shares an extent with other files, when processing
the extent item relative to a shared extent, we blindly issue a clone
operation that will target a length matching the length in the extent item
and uses as a source some other file the receiver already has and points
to the same extent. However that range in the other file might not
exclusively point only to the shared extent, and so using that length
will result in the receiver getting a file with different data from the
one in the send snapshot. This issue happened both for incremental and
full send operations.
So fix this by issuing clone operations with lengths that don't cover
regions of the source file that point to different extents (or have holes).
The following test case for fstests reproduces the problem.
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -fr $send_files_dir
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_supported_fs btrfs
_supported_os Linux
_require_scratch
_need_to_be_root
_require_cp_reflink
_require_xfs_io_command "fpunch"
send_files_dir=$TEST_DIR/btrfs-test-$seq
rm -f $seqres.full
rm -fr $send_files_dir
mkdir $send_files_dir
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount
# Create our test file with a single 100K extent.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Clone our file into a new file named bar.
cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Now overwrite parts of our foo file.
$XFS_IO_PROG -c "pwrite -S 0xbb 50K 10K" \
-c "pwrite -S 0xcc 90K 10K" \
-c "fpunch 70K 10k" \
$SCRATCH_MNT/foo | _filter_xfs_io
_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
$SCRATCH_MNT/snap
echo "File digests in the original filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
_run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap
# Now recreate the filesystem by receiving the send stream and verify
# we get the same file contents that the original filesystem had.
_scratch_unmount
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount
_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap
# We expect the destination filesystem to have exactly the same file
# data as the original filesystem.
# The btrfs send implementation had a bug where it sent a clone
# operation from file foo into file bar covering the whole [0, 100K[
# range after creating and writing the file foo. This was incorrect
# because the file bar now included the updates done to file foo after
# we cloned foo to bar, breaking the COW nature of reflink copies
# (cloned extents).
echo "File digests in the new filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
status=0
exit
Another test case that reproduces the problem when we have compressed
extents:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -fr $send_files_dir
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_supported_fs btrfs
_supported_os Linux
_require_scratch
_need_to_be_root
_require_cp_reflink
send_files_dir=$TEST_DIR/btrfs-test-$seq
rm -f $seqres.full
rm -fr $send_files_dir
mkdir $send_files_dir
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create our file with an extent of 100K starting at file offset 0K.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K" \
-c "fsync" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Rewrite part of the previous extent (its first 40K) and write a new
# 100K extent starting at file offset 100K.
$XFS_IO_PROG -c "pwrite -S 0xbb 0K 40K" \
-c "pwrite -S 0xcc 100K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Our file foo now has 3 file extent items in its metadata:
#
# 1) One covering the file range 0 to 40K;
# 2) One covering the file range 40K to 100K, which points to the first
# extent we wrote to the file and has a data offset field with value
# 40K (our file no longer uses the first 40K of data from that
# extent);
# 3) One covering the file range 100K to 200K.
# Now clone our file foo into file bar.
cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Create our snapshot for the send operation.
_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
$SCRATCH_MNT/snap
echo "File digests in the original filesystem:"
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
_run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap
# Now recreate the filesystem by receiving the send stream and verify we
# get the same file contents that the original filesystem had.
# Btrfs send used to issue a clone operation from foo's range
# [80K, 140K[ to bar's range [40K, 100K[ when cloning the extent pointed
# to by foo's second file extent item, this was incorrect because of bad
# accounting of the file extent item's data offset field. The correct
# range to clone from should have been [40K, 100K[.
_scratch_unmount
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap
echo "File digests in the new filesystem:"
# Must match the digests we got in the original filesystem.
md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
md5sum $SCRATCH_MNT/snap/bar | _filter_scratch
status=0
exit
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-02 17:47:34 +08:00
|
|
|
ret = send_extent_data(sctx, offset, len);
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int is_extent_unchanged(struct send_ctx *sctx,
|
|
|
|
struct btrfs_path *left_path,
|
|
|
|
struct btrfs_key *ekey)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
int slot;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_file_extent_item *ei;
|
|
|
|
u64 left_disknr;
|
|
|
|
u64 right_disknr;
|
|
|
|
u64 left_offset;
|
|
|
|
u64 right_offset;
|
|
|
|
u64 left_offset_fixed;
|
|
|
|
u64 left_len;
|
|
|
|
u64 right_len;
|
2012-08-08 04:25:13 +08:00
|
|
|
u64 left_gen;
|
|
|
|
u64 right_gen;
|
2012-07-26 05:19:24 +08:00
|
|
|
u8 left_type;
|
|
|
|
u8 right_type;
|
|
|
|
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
eb = left_path->nodes[0];
|
|
|
|
slot = left_path->slots[0];
|
|
|
|
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
|
|
|
|
left_type = btrfs_file_extent_type(eb, ei);
|
|
|
|
|
|
|
|
if (left_type != BTRFS_FILE_EXTENT_REG) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-08-08 04:25:13 +08:00
|
|
|
left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
|
|
|
|
left_len = btrfs_file_extent_num_bytes(eb, ei);
|
|
|
|
left_offset = btrfs_file_extent_offset(eb, ei);
|
|
|
|
left_gen = btrfs_file_extent_generation(eb, ei);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Following comments will refer to these graphics. L is the left
|
|
|
|
* extents which we are checking at the moment. 1-8 are the right
|
|
|
|
* extents that we iterate.
|
|
|
|
*
|
|
|
|
* |-----L-----|
|
|
|
|
* |-1-|-2a-|-3-|-4-|-5-|-6-|
|
|
|
|
*
|
|
|
|
* |-----L-----|
|
|
|
|
* |--1--|-2b-|...(same as above)
|
|
|
|
*
|
|
|
|
* Alternative situation. Happens on files where extents got split.
|
|
|
|
* |-----L-----|
|
|
|
|
* |-----------7-----------|-6-|
|
|
|
|
*
|
|
|
|
* Alternative situation. Happens on files which got larger.
|
|
|
|
* |-----L-----|
|
|
|
|
* |-8-|
|
|
|
|
* Nothing follows after 8.
|
|
|
|
*/
|
|
|
|
|
|
|
|
key.objectid = ekey->objectid;
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = ekey->offset;
|
|
|
|
ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Handle special case where the right side has no extents at all.
|
|
|
|
*/
|
|
|
|
eb = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
btrfs_item_key_to_cpu(eb, &found_key, slot);
|
|
|
|
if (found_key.objectid != key.objectid ||
|
|
|
|
found_key.type != key.type) {
|
2013-08-21 03:55:39 +08:00
|
|
|
/* If we're a hole then just pretend nothing changed */
|
|
|
|
ret = (left_disknr) ? 0 : 1;
|
2012-07-26 05:19:24 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We're now on 2a, 2b or 7.
|
|
|
|
*/
|
|
|
|
key = found_key;
|
|
|
|
while (key.offset < ekey->offset + left_len) {
|
|
|
|
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
|
|
|
|
right_type = btrfs_file_extent_type(eb, ei);
|
|
|
|
if (right_type != BTRFS_FILE_EXTENT_REG) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2013-11-01 04:49:02 +08:00
|
|
|
right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
|
|
|
|
right_len = btrfs_file_extent_num_bytes(eb, ei);
|
|
|
|
right_offset = btrfs_file_extent_offset(eb, ei);
|
|
|
|
right_gen = btrfs_file_extent_generation(eb, ei);
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
/*
|
|
|
|
* Are we at extent 8? If yes, we know the extent is changed.
|
|
|
|
* This may only happen on the first iteration.
|
|
|
|
*/
|
2012-08-01 18:49:15 +08:00
|
|
|
if (found_key.offset + right_len <= ekey->offset) {
|
2013-08-21 03:55:39 +08:00
|
|
|
/* If we're a hole just pretend nothing changed */
|
|
|
|
ret = (left_disknr) ? 0 : 1;
|
2012-07-26 05:19:24 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
left_offset_fixed = left_offset;
|
|
|
|
if (key.offset < ekey->offset) {
|
|
|
|
/* Fix the right offset for 2a and 7. */
|
|
|
|
right_offset += ekey->offset - key.offset;
|
|
|
|
} else {
|
|
|
|
/* Fix the left offset for all behind 2a and 2b */
|
|
|
|
left_offset_fixed += key.offset - ekey->offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if we have the same extent.
|
|
|
|
*/
|
2012-08-01 18:46:05 +08:00
|
|
|
if (left_disknr != right_disknr ||
|
2012-08-08 04:25:13 +08:00
|
|
|
left_offset_fixed != right_offset ||
|
|
|
|
left_gen != right_gen) {
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Go to the next extent.
|
|
|
|
*/
|
|
|
|
ret = btrfs_next_item(sctx->parent_root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (!ret) {
|
|
|
|
eb = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
btrfs_item_key_to_cpu(eb, &found_key, slot);
|
|
|
|
}
|
|
|
|
if (ret || found_key.objectid != key.objectid ||
|
|
|
|
found_key.type != key.type) {
|
|
|
|
key.offset += right_len;
|
|
|
|
break;
|
2013-03-21 22:30:23 +08:00
|
|
|
}
|
|
|
|
if (found_key.offset != key.offset + right_len) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
key = found_key;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We're now behind the left extent (treat as unchanged) or at the end
|
|
|
|
* of the right side (treat as changed).
|
|
|
|
*/
|
|
|
|
if (key.offset >= ekey->offset + left_len)
|
|
|
|
ret = 1;
|
|
|
|
else
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-10-23 00:18:51 +08:00
|
|
|
/*
 * Record in sctx->cur_inode_last_extent the end offset of the file extent
 * item that a read search for (cur_ino, EXTENT_DATA, @offset) in the send
 * root lands on.
 *
 * cur_inode_last_extent is first reset to 0 and keeps that value when the
 * item found is not a file extent item of the current inode.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the
 * negative error reported by the tree search.
 */
static int get_last_extent(struct send_ctx *sctx, u64 offset)
{
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_key key;
	u64 end;
	u8 extent_type;
	int slot;
	int ret;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	sctx->cur_inode_last_extent = 0;

	key.objectid = sctx->cur_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = offset;
	ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
	if (ret < 0)
		goto out;
	/* A positive search result is not reported to the caller. */
	ret = 0;

	leaf = path->nodes[0];
	slot = path->slots[0];
	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.type != BTRFS_EXTENT_DATA_KEY || key.objectid != sctx->cur_ino)
		goto out;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	extent_type = btrfs_file_extent_type(leaf, fi);
	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		/* The end of an inline extent is rounded up to the sector size. */
		u64 size = btrfs_file_extent_inline_len(leaf, slot, fi);

		end = ALIGN(key.offset + size, root->fs_info->sectorsize);
	} else {
		end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	}
	sctx->cur_inode_last_extent = end;
out:
	btrfs_free_path(path);
	return ret;
}
|
|
|
|
|
|
|
|
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
|
|
|
|
struct btrfs_key *key)
|
|
|
|
{
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
u64 extent_end;
|
|
|
|
u8 type;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (sctx->cur_inode_last_extent == (u64)-1) {
|
|
|
|
ret = get_last_extent(sctx, key->offset - 1);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
type = btrfs_file_extent_type(path->nodes[0], fi);
|
|
|
|
if (type == BTRFS_FILE_EXTENT_INLINE) {
|
2014-01-04 13:07:00 +08:00
|
|
|
u64 size = btrfs_file_extent_inline_len(path->nodes[0],
|
|
|
|
path->slots[0], fi);
|
2013-10-23 00:18:51 +08:00
|
|
|
extent_end = ALIGN(key->offset + size,
|
2016-06-15 21:22:56 +08:00
|
|
|
sctx->send_root->fs_info->sectorsize);
|
2013-10-23 00:18:51 +08:00
|
|
|
} else {
|
|
|
|
extent_end = key->offset +
|
|
|
|
btrfs_file_extent_num_bytes(path->nodes[0], fi);
|
|
|
|
}
|
2014-01-28 09:38:06 +08:00
|
|
|
|
|
|
|
if (path->slots[0] == 0 &&
|
|
|
|
sctx->cur_inode_last_extent < key->offset) {
|
|
|
|
/*
|
|
|
|
* We might have skipped entire leafs that contained only
|
|
|
|
* file extent items for our current inode. These leafs have
|
|
|
|
* a generation number smaller (older) than the one in the
|
|
|
|
* current leaf and the leaf our last extent came from, and
|
|
|
|
* are located between these 2 leafs.
|
|
|
|
*/
|
|
|
|
ret = get_last_extent(sctx, key->offset - 1);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-10-23 00:18:51 +08:00
|
|
|
if (sctx->cur_inode_last_extent < key->offset)
|
|
|
|
ret = send_hole(sctx, key->offset);
|
|
|
|
sctx->cur_inode_last_extent = extent_end;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
static int process_extent(struct send_ctx *sctx,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
struct btrfs_key *key)
|
|
|
|
{
|
|
|
|
struct clone_root *found_clone = NULL;
|
2013-08-21 03:55:39 +08:00
|
|
|
int ret = 0;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
if (S_ISLNK(sctx->cur_inode_mode))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (sctx->parent_root && !sctx->cur_inode_new) {
|
|
|
|
ret = is_extent_unchanged(sctx, path, key);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
|
|
|
ret = 0;
|
2013-10-23 00:18:51 +08:00
|
|
|
goto out_hole;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
2013-08-21 03:55:39 +08:00
|
|
|
} else {
|
|
|
|
struct btrfs_file_extent_item *ei;
|
|
|
|
u8 type;
|
|
|
|
|
|
|
|
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
type = btrfs_file_extent_type(path->nodes[0], ei);
|
|
|
|
if (type == BTRFS_FILE_EXTENT_PREALLOC ||
|
|
|
|
type == BTRFS_FILE_EXTENT_REG) {
|
|
|
|
/*
|
|
|
|
* The send spec does not have a prealloc command yet,
|
|
|
|
* so just leave a hole for prealloc'ed extents until
|
|
|
|
* we have enough commands queued up to justify rev'ing
|
|
|
|
* the send spec.
|
|
|
|
*/
|
|
|
|
if (type == BTRFS_FILE_EXTENT_PREALLOC) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Have a hole, just skip it. */
|
|
|
|
if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ret = find_extent_clone(sctx, path, key->objectid, key->offset,
|
|
|
|
sctx->cur_inode_size, &found_clone);
|
|
|
|
if (ret != -ENOENT && ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = send_write_or_clone(sctx, path, key, found_clone);
|
2013-10-23 00:18:51 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
out_hole:
|
|
|
|
ret = maybe_send_hole(sctx, path, key);
|
2012-07-26 05:19:24 +08:00
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int process_all_extents(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
int slot;
|
|
|
|
|
|
|
|
root = sctx->send_root;
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = sctx->cmp_key->objectid;
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = 0;
|
2014-01-25 01:42:09 +08:00
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2014-01-25 01:42:09 +08:00
|
|
|
while (1) {
|
2012-07-26 05:19:24 +08:00
|
|
|
eb = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
2014-01-25 01:42:09 +08:00
|
|
|
|
|
|
|
if (slot >= btrfs_header_nritems(eb)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0) {
|
|
|
|
goto out;
|
|
|
|
} else if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
btrfs_item_key_to_cpu(eb, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.objectid != key.objectid ||
|
|
|
|
found_key.type != key.type) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = process_extent(sctx, path, &found_key);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2014-01-25 01:42:09 +08:00
|
|
|
path->slots[0]++;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
|
|
|
|
int *pending_move,
|
|
|
|
int *refs_processed)
|
2012-07-26 05:19:24 +08:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (sctx->cur_ino == 0)
|
|
|
|
goto out;
|
|
|
|
if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
|
2012-10-15 16:30:45 +08:00
|
|
|
sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
|
2012-07-26 05:19:24 +08:00
|
|
|
goto out;
|
|
|
|
if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
|
|
|
|
goto out;
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
ret = process_recorded_refs(sctx, pending_move);
|
2012-07-28 22:09:35 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
*refs_processed = 1;
|
2012-07-26 05:19:24 +08:00
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
u64 left_mode;
|
|
|
|
u64 left_uid;
|
|
|
|
u64 left_gid;
|
|
|
|
u64 right_mode;
|
|
|
|
u64 right_uid;
|
|
|
|
u64 right_gid;
|
|
|
|
int need_chmod = 0;
|
|
|
|
int need_chown = 0;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process an hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually lead to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the highest inode number was
renamed/move. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
int pending_move = 0;
|
|
|
|
int refs_processed = 0;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number
was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
|
|
|
|
&refs_processed);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number
was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
/*
|
|
|
|
* We have processed the refs and thus need to advance send_progress.
|
|
|
|
* Now, calls to get_cur_xxx will take the updated refs of the current
|
|
|
|
* inode into account.
|
|
|
|
*
|
|
|
|
* On the other hand, if our current inode is a directory and couldn't
|
|
|
|
* be moved/renamed because its parent was renamed/moved too and it has
|
|
|
|
* a higher inode number, we can only move/rename our current inode
|
|
|
|
* after we moved/renamed its parent. Therefore in this case operate on
|
|
|
|
* the old path (pre move/rename) of our current inode, and the
|
|
|
|
* move/rename will be performed later.
|
|
|
|
*/
|
|
|
|
if (refs_processed && !pending_move)
|
|
|
|
sctx->send_progress = sctx->cur_ino + 1;
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
|
|
|
|
goto out;
|
|
|
|
if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
|
2012-07-27 05:39:10 +08:00
|
|
|
&left_mode, &left_uid, &left_gid, NULL);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2012-10-17 21:52:47 +08:00
|
|
|
if (!sctx->parent_root || sctx->cur_inode_new) {
|
|
|
|
need_chown = 1;
|
|
|
|
if (!S_ISLNK(sctx->cur_inode_mode))
|
2012-07-26 05:19:24 +08:00
|
|
|
need_chmod = 1;
|
2012-10-17 21:52:47 +08:00
|
|
|
} else {
|
|
|
|
ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
|
|
|
|
NULL, NULL, &right_mode, &right_uid,
|
|
|
|
&right_gid, NULL);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2012-10-17 21:52:47 +08:00
|
|
|
if (left_uid != right_uid || left_gid != right_gid)
|
|
|
|
need_chown = 1;
|
|
|
|
if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
|
|
|
|
need_chmod = 1;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (S_ISREG(sctx->cur_inode_mode)) {
|
2013-10-23 00:18:51 +08:00
|
|
|
if (need_send_hole(sctx)) {
|
2014-03-31 06:02:53 +08:00
|
|
|
if (sctx->cur_inode_last_extent == (u64)-1 ||
|
|
|
|
sctx->cur_inode_last_extent <
|
|
|
|
sctx->cur_inode_size) {
|
2013-10-23 00:18:51 +08:00
|
|
|
ret = get_last_extent(sctx, (u64)-1);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (sctx->cur_inode_last_extent <
|
|
|
|
sctx->cur_inode_size) {
|
|
|
|
ret = send_hole(sctx, sctx->cur_inode_size);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
|
|
|
|
sctx->cur_inode_size);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (need_chown) {
|
|
|
|
ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
|
|
|
|
left_uid, left_gid);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (need_chmod) {
|
|
|
|
ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
|
|
|
|
left_mode);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number
was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
* If other directory inodes depended on our current directory
|
|
|
|
* inode's move/rename, now do their move/rename operations.
|
2012-07-26 05:19:24 +08:00
|
|
|
*/
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number
was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
|
|
|
|
ret = apply_children_dir_moves(sctx);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
2014-03-03 20:28:40 +08:00
|
|
|
/*
|
|
|
|
* Need to send that every time, no matter if it actually
|
|
|
|
* changed between the two trees as we have done changes to
|
|
|
|
* the inode before. If our inode is a directory and it's
|
|
|
|
* waiting to be moved/renamed, we will send its utimes when
|
|
|
|
* it's moved/renamed, therefore we don't need to do it here.
|
|
|
|
*/
|
|
|
|
sctx->send_progress = sctx->cur_ino + 1;
|
|
|
|
ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower inode before the inode with the higher number
was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c (ino 259)
| |-- d (ino 260)
|
|-- c2 (ino 261)
And its structure when the second snapshot is taken is:
. (ino 256)
|-- a (ino 257)
|-- b (ino 258)
|-- c2 (ino 261)
|-- d2 (ino 260)
|-- cc (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int changed_inode(struct send_ctx *sctx,
|
|
|
|
enum btrfs_compare_tree_result result)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_key *key = sctx->cmp_key;
|
|
|
|
struct btrfs_inode_item *left_ii = NULL;
|
|
|
|
struct btrfs_inode_item *right_ii = NULL;
|
|
|
|
u64 left_gen = 0;
|
|
|
|
u64 right_gen = 0;
|
|
|
|
|
|
|
|
sctx->cur_ino = key->objectid;
|
|
|
|
sctx->cur_inode_new_gen = 0;
|
2013-10-23 00:18:51 +08:00
|
|
|
sctx->cur_inode_last_extent = (u64)-1;
|
2012-07-28 22:09:35 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Set send_progress to current inode. This will tell all get_cur_xxx
|
|
|
|
* functions that the current inode's refs are not updated yet. Later,
|
|
|
|
* when process_recorded_refs is finished, it is set to cur_ino + 1.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->send_progress = sctx->cur_ino;
|
|
|
|
|
|
|
|
if (result == BTRFS_COMPARE_TREE_NEW ||
|
|
|
|
result == BTRFS_COMPARE_TREE_CHANGED) {
|
|
|
|
left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
|
|
|
|
sctx->left_path->slots[0],
|
|
|
|
struct btrfs_inode_item);
|
|
|
|
left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
|
|
|
|
left_ii);
|
|
|
|
} else {
|
|
|
|
right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
|
|
|
|
sctx->right_path->slots[0],
|
|
|
|
struct btrfs_inode_item);
|
|
|
|
right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
|
|
|
|
right_ii);
|
|
|
|
}
|
|
|
|
if (result == BTRFS_COMPARE_TREE_CHANGED) {
|
|
|
|
right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
|
|
|
|
sctx->right_path->slots[0],
|
|
|
|
struct btrfs_inode_item);
|
|
|
|
|
|
|
|
right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
|
|
|
|
right_ii);
|
2012-08-01 20:48:59 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The cur_ino = root dir case is special here. We can't treat
|
|
|
|
* the inode as deleted+reused because it would generate a
|
|
|
|
* stream that tries to delete/mkdir the root dir.
|
|
|
|
*/
|
|
|
|
if (left_gen != right_gen &&
|
|
|
|
sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->cur_inode_new_gen = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result == BTRFS_COMPARE_TREE_NEW) {
|
|
|
|
sctx->cur_inode_gen = left_gen;
|
|
|
|
sctx->cur_inode_new = 1;
|
|
|
|
sctx->cur_inode_deleted = 0;
|
|
|
|
sctx->cur_inode_size = btrfs_inode_size(
|
|
|
|
sctx->left_path->nodes[0], left_ii);
|
|
|
|
sctx->cur_inode_mode = btrfs_inode_mode(
|
|
|
|
sctx->left_path->nodes[0], left_ii);
|
2014-02-27 17:29:01 +08:00
|
|
|
sctx->cur_inode_rdev = btrfs_inode_rdev(
|
|
|
|
sctx->left_path->nodes[0], left_ii);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
|
2012-07-28 16:42:24 +08:00
|
|
|
ret = send_create_inode_if_needed(sctx);
|
2012-07-26 05:19:24 +08:00
|
|
|
} else if (result == BTRFS_COMPARE_TREE_DELETED) {
|
|
|
|
sctx->cur_inode_gen = right_gen;
|
|
|
|
sctx->cur_inode_new = 0;
|
|
|
|
sctx->cur_inode_deleted = 1;
|
|
|
|
sctx->cur_inode_size = btrfs_inode_size(
|
|
|
|
sctx->right_path->nodes[0], right_ii);
|
|
|
|
sctx->cur_inode_mode = btrfs_inode_mode(
|
|
|
|
sctx->right_path->nodes[0], right_ii);
|
|
|
|
} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* We need to do some special handling in case the inode was
|
|
|
|
* reported as changed with a changed generation number. This
|
|
|
|
* means that the original inode was deleted and new inode
|
|
|
|
* reused the same inum. So we have to treat the old inode as
|
|
|
|
* deleted and the new one as new.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
if (sctx->cur_inode_new_gen) {
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* First, process the inode as if it was deleted.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->cur_inode_gen = right_gen;
|
|
|
|
sctx->cur_inode_new = 0;
|
|
|
|
sctx->cur_inode_deleted = 1;
|
|
|
|
sctx->cur_inode_size = btrfs_inode_size(
|
|
|
|
sctx->right_path->nodes[0], right_ii);
|
|
|
|
sctx->cur_inode_mode = btrfs_inode_mode(
|
|
|
|
sctx->right_path->nodes[0], right_ii);
|
|
|
|
ret = process_all_refs(sctx,
|
|
|
|
BTRFS_COMPARE_TREE_DELETED);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Now process the inode as if it was new.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->cur_inode_gen = left_gen;
|
|
|
|
sctx->cur_inode_new = 1;
|
|
|
|
sctx->cur_inode_deleted = 0;
|
|
|
|
sctx->cur_inode_size = btrfs_inode_size(
|
|
|
|
sctx->left_path->nodes[0], left_ii);
|
|
|
|
sctx->cur_inode_mode = btrfs_inode_mode(
|
|
|
|
sctx->left_path->nodes[0], left_ii);
|
2014-02-27 17:29:01 +08:00
|
|
|
sctx->cur_inode_rdev = btrfs_inode_rdev(
|
|
|
|
sctx->left_path->nodes[0], left_ii);
|
2012-07-28 16:42:24 +08:00
|
|
|
ret = send_create_inode_if_needed(sctx);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-07-28 22:09:35 +08:00
|
|
|
/*
|
|
|
|
* Advance send_progress now as we did not get into
|
|
|
|
* process_recorded_refs_if_needed in the new_gen case.
|
|
|
|
*/
|
|
|
|
sctx->send_progress = sctx->cur_ino + 1;
|
2012-07-28 20:11:31 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Now process all extents and xattrs of the inode as if
|
|
|
|
* they were all new.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = process_all_extents(sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = process_all_new_xattrs(sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
sctx->cur_inode_gen = left_gen;
|
|
|
|
sctx->cur_inode_new = 0;
|
|
|
|
sctx->cur_inode_new_gen = 0;
|
|
|
|
sctx->cur_inode_deleted = 0;
|
|
|
|
sctx->cur_inode_size = btrfs_inode_size(
|
|
|
|
sctx->left_path->nodes[0], left_ii);
|
|
|
|
sctx->cur_inode_mode = btrfs_inode_mode(
|
|
|
|
sctx->left_path->nodes[0], left_ii);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* We have to process new refs before deleted refs, but compare_trees gives us
|
|
|
|
* the new and deleted refs mixed. To fix this, we record the new/deleted refs
|
|
|
|
* first and later process them in process_recorded_refs.
|
|
|
|
* For the cur_inode_new_gen case, we skip recording completely because
|
|
|
|
* changed_inode did already initiate processing of refs. The reason for this is
|
|
|
|
* that in this case, compare_tree actually compares the refs of 2 different
|
|
|
|
* inodes. To fix this, process_all_refs is used in changed_inode to handle all
|
|
|
|
* refs of the right tree as deleted and all refs of the left tree as new.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
static int changed_ref(struct send_ctx *sctx,
|
|
|
|
enum btrfs_compare_tree_result result)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
Btrfs: send, don't bug on inconsistent snapshots
When doing an incremental send, if we find a new/modified/deleted extent,
reference or xattr without having previously processed the corresponding
inode item we end up exexuting a BUG_ON(). This is because whenever an
extent, xattr or reference is added, modified or deleted, we always expect
to have the corresponding inode item updated. However there are situations
where this will not happen due to transient -ENOMEM or -ENOSPC errors when
doing delayed inode updates.
For example, when punching holes we can succeed in deleting and modifying
(shrinking) extents but later fail to do the delayed inode update. So after
such failure we close our transaction handle and right after a snapshot of
the fs/subvol tree can be made and used later for a send operation. The
same thing can happen during truncate, link, unlink, and xattr related
operations.
So instead of executing a BUG_ON, make send return an -EIO error and print
an informative error message do dmesg/syslog.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2016-08-01 08:50:37 +08:00
|
|
|
if (sctx->cur_ino != sctx->cmp_key->objectid) {
|
|
|
|
inconsistent_snapshot_error(sctx, result, "reference");
|
|
|
|
return -EIO;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
if (!sctx->cur_inode_new_gen &&
|
|
|
|
sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
if (result == BTRFS_COMPARE_TREE_NEW)
|
|
|
|
ret = record_new_ref(sctx);
|
|
|
|
else if (result == BTRFS_COMPARE_TREE_DELETED)
|
|
|
|
ret = record_deleted_ref(sctx);
|
|
|
|
else if (result == BTRFS_COMPARE_TREE_CHANGED)
|
|
|
|
ret = record_changed_ref(sctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Process new/deleted/changed xattrs. We skip processing in the
|
|
|
|
* cur_inode_new_gen case because changed_inode did already initiate processing
|
|
|
|
* of xattrs. The reason is the same as in changed_ref
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
static int changed_xattr(struct send_ctx *sctx,
|
|
|
|
enum btrfs_compare_tree_result result)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
Btrfs: send, don't bug on inconsistent snapshots
When doing an incremental send, if we find a new/modified/deleted extent,
reference or xattr without having previously processed the corresponding
inode item we end up exexuting a BUG_ON(). This is because whenever an
extent, xattr or reference is added, modified or deleted, we always expect
to have the corresponding inode item updated. However there are situations
where this will not happen due to transient -ENOMEM or -ENOSPC errors when
doing delayed inode updates.
For example, when punching holes we can succeed in deleting and modifying
(shrinking) extents but later fail to do the delayed inode update. So after
such failure we close our transaction handle and right after a snapshot of
the fs/subvol tree can be made and used later for a send operation. The
same thing can happen during truncate, link, unlink, and xattr related
operations.
So instead of executing a BUG_ON, make send return an -EIO error and print
an informative error message do dmesg/syslog.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2016-08-01 08:50:37 +08:00
|
|
|
if (sctx->cur_ino != sctx->cmp_key->objectid) {
|
|
|
|
inconsistent_snapshot_error(sctx, result, "xattr");
|
|
|
|
return -EIO;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
|
|
|
|
if (result == BTRFS_COMPARE_TREE_NEW)
|
|
|
|
ret = process_new_xattr(sctx);
|
|
|
|
else if (result == BTRFS_COMPARE_TREE_DELETED)
|
|
|
|
ret = process_deleted_xattr(sctx);
|
|
|
|
else if (result == BTRFS_COMPARE_TREE_CHANGED)
|
|
|
|
ret = process_changed_xattr(sctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Process new/deleted/changed extents. We skip processing in the
|
|
|
|
* cur_inode_new_gen case because changed_inode did already initiate processing
|
|
|
|
* of extents. The reason is the same as in changed_ref
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
static int changed_extent(struct send_ctx *sctx,
|
|
|
|
enum btrfs_compare_tree_result result)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
Btrfs: send, don't bug on inconsistent snapshots
When doing an incremental send, if we find a new/modified/deleted extent,
reference or xattr without having previously processed the corresponding
inode item we end up exexuting a BUG_ON(). This is because whenever an
extent, xattr or reference is added, modified or deleted, we always expect
to have the corresponding inode item updated. However there are situations
where this will not happen due to transient -ENOMEM or -ENOSPC errors when
doing delayed inode updates.
For example, when punching holes we can succeed in deleting and modifying
(shrinking) extents but later fail to do the delayed inode update. So after
such failure we close our transaction handle and right after a snapshot of
the fs/subvol tree can be made and used later for a send operation. The
same thing can happen during truncate, link, unlink, and xattr related
operations.
So instead of executing a BUG_ON, make send return an -EIO error and print
an informative error message do dmesg/syslog.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2016-08-01 08:50:37 +08:00
|
|
|
if (sctx->cur_ino != sctx->cmp_key->objectid) {
|
Btrfs: fix incremental send failure caused by balance
Commit 951555856b88 ("Btrfs: send, don't bug on inconsistent snapshots")
removed some BUG_ON() statements (replacing them with returning errors
to user space and logging error messages) when a snapshot is in an
inconsistent state due to failures to update a delayed inode item (ENOMEM
or ENOSPC) after adding/updating/deleting references, xattrs or file
extent items.
However there is a case, when no errors happen, where a file extent item
can be modified without having the corresponding inode item updated. This
case happens during balance under very specific timings, when relocation
is in the stage where it updates data pointers and a leaf that contains
file extent items is COWed. When that happens file extent items get their
disk_bytenr field updated to a new value that reflects the post relocation
logical address of the extent, without updating their respective inode
items (as there is nothing that needs to be updated on them). This is
performed at relocation.c:replace_file_extents() through
relocation.c:btrfs_reloc_cow_block().
So make an incremental send deal with this case and don't do any processing
for a file extent item that got its disk_bytenr field updated by relocation,
since the extent's data is the same as the one pointed by the file extent
item in the parent snapshot.
After the recent commit mentioned above this case resulted in EIO errors
returned to user space (and an error message logged to dmesg/syslog) when
doing an incremental send, while before it, it resulted in hitting a
BUG_ON leading to the following trace:
[ 952.206705] ------------[ cut here ]------------
[ 952.206714] kernel BUG at ../fs/btrfs/send.c:5653!
[ 952.206719] Internal error: Oops - BUG: 0 [#1] SMP
[ 952.209854] Modules linked in: st dm_mod nls_utf8 isofs fuse nf_log_ipv6 xt_pkttype xt_physdev br_netfilter nf_log_ipv4 nf_log_common xt_LOG xt_limit ebtable_filter ebtables af_packet bridge stp llc ip6t_REJECT xt_tcpudp nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_raw ipt_REJECT iptable_raw xt_CT iptable_filter ip6table_mangle nf_conntrack_netbios_ns nf_conntrack_broadcast nf_conntrack_ipv4 nf_defrag_ipv4 ip_tables xt_conntrack nf_conntrack ip6table_filter ip6_tables x_tables xfs libcrc32c nls_iso8859_1 nls_cp437 vfat fat joydev aes_ce_blk ablk_helper cryptd snd_intel8x0 aes_ce_cipher snd_ac97_codec ac97_bus snd_pcm ghash_ce sha2_ce sha1_ce snd_timer snd virtio_net soundcore btrfs xor sr_mod cdrom hid_generic usbhid raid6_pq virtio_blk virtio_scsi bochs_drm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm virtio_mmio xhci_pci xhci_hcd usbcore usb_common virtio_pci virtio_ring virtio drm sg efivarfs
[ 952.228333] Supported: Yes
[ 952.228908] CPU: 0 PID: 12779 Comm: snapperd Not tainted 4.4.14-50-default #1
[ 952.230329] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
[ 952.231683] task: ffff800058e94100 ti: ffff8000d866c000 task.ti: ffff8000d866c000
[ 952.233279] PC is at changed_cb+0x9f4/0xa48 [btrfs]
[ 952.234375] LR is at changed_cb+0x58/0xa48 [btrfs]
[ 952.236552] pc : [<ffff7ffffc39de7c>] lr : [<ffff7ffffc39d4e0>] pstate: 80000145
[ 952.238049] sp : ffff8000d866fa20
[ 952.238732] x29: ffff8000d866fa20 x28: 0000000000000019
[ 952.239840] x27: 00000000000028d5 x26: 00000000000024a2
[ 952.241008] x25: 0000000000000002 x24: ffff8000e66e92f0
[ 952.242131] x23: ffff8000b8c76800 x22: ffff800092879140
[ 952.243238] x21: 0000000000000002 x20: ffff8000d866fb78
[ 952.244348] x19: ffff8000b8f8c200 x18: 0000000000002710
[ 952.245607] x17: 0000ffff90d42480 x16: ffff800000237dc0
[ 952.246719] x15: 0000ffff90de7510 x14: ab000c000a2faf08
[ 952.247835] x13: 0000000000577c2b x12: ab000c000b696665
[ 952.248981] x11: 2e65726f632f6966 x10: 652d34366d72612f
[ 952.250101] x9 : 32627572672f746f x8 : ab000c00092f1671
[ 952.251352] x7 : 8000000000577c2b x6 : ffff800053eadf45
[ 952.252468] x5 : 0000000000000000 x4 : ffff80005e169494
[ 952.253582] x3 : 0000000000000004 x2 : ffff8000d866fb78
[ 952.254695] x1 : 000000000003e2a3 x0 : 000000000003e2a4
[ 952.255803]
[ 952.256150] Process snapperd (pid: 12779, stack limit = 0xffff8000d866c020)
[ 952.257516] Stack: (0xffff8000d866fa20 to 0xffff8000d8670000)
[ 952.258654] fa20: ffff8000d866fae0 ffff7ffffc308fc0 ffff800092879140 ffff8000e66e92f0
[ 952.260219] fa40: 0000000000000035 ffff800055de6000 ffff8000b8c76800 ffff8000d866fb78
[ 952.261745] fa60: 0000000000000002 00000000000024a2 00000000000028d5 0000000000000019
[ 952.263269] fa80: ffff8000d866fae0 ffff7ffffc3090f0 ffff8000d866fae0 ffff7ffffc309128
[ 952.264797] faa0: ffff800092879140 ffff8000e66e92f0 0000000000000035 ffff800055de6000
[ 952.268261] fac0: ffff8000b8c76800 ffff8000d866fb78 0000000000000002 0000000000001000
[ 952.269822] fae0: ffff8000d866fbc0 ffff7ffffc39ecfc ffff8000b8f8c200 ffff8000b8f8c368
[ 952.271368] fb00: ffff8000b8f8c378 ffff800055de6000 0000000000000001 ffff8000ecb17500
[ 952.272893] fb20: ffff8000b8c76800 ffff800092879140 ffff800062b6d000 ffff80007a9e2470
[ 952.274420] fb40: ffff8000b8f8c208 0000000005784000 ffff8000580a8000 ffff8000b8f8c200
[ 952.276088] fb60: ffff7ffffc39d488 00000002b8f8c368 0000000000000000 000000000003e2a4
[ 952.280275] fb80: 000000000000006c ffff7ffffc39ec00 000000000003e2a4 000000000000006c
[ 952.283219] fba0: ffff8000b8f8c300 0000000000000100 0000000000000001 ffff8000ecb17500
[ 952.286166] fbc0: ffff8000d866fcd0 ffff7ffffc3643c0 ffff8000f8842700 0000ffff8ffe9278
[ 952.289136] fbe0: 0000000040489426 ffff800055de6000 0000ffff8ffe9278 0000000040489426
[ 952.292083] fc00: 000000000000011d 000000000000001d ffff80007a9e4598 ffff80007a9e43e8
[ 952.294959] fc20: ffff8000b8c7693f 0000000000003b24 0000000000000019 ffff8000b8f8c218
[ 952.301161] fc40: 00000001d866fc70 ffff8000b8c76800 0000000000000128 ffffffffffffff84
[ 952.305749] fc60: ffff800058e941ff 0000000000003a58 ffff8000d866fcb0 ffff8000000f7390
[ 952.308875] fc80: 000000000000012a 0000000000010290 ffff8000d866fc00 000000000000007b
[ 952.311915] fca0: 0000000000010290 ffff800046c1b100 74732d7366727462 000001006d616572
[ 952.314937] fcc0: ffff8000fffc4100 cb88537fdc8ba60e ffff8000d866fe10 ffff8000002499e8
[ 952.318008] fce0: 0000000040489426 ffff8000f8842700 0000ffff8ffe9278 ffff80007a9e4598
[ 952.321321] fd00: 0000ffff8ffe9278 0000000040489426 000000000000011d 000000000000001d
[ 952.324280] fd20: ffff80000072c000 ffff8000d866c000 ffff8000d866fda0 ffff8000000e997c
[ 952.327156] fd40: ffff8000fffc4180 00000000000031ed ffff8000fffc4180 ffff800046c1b7d4
[ 952.329895] fd60: 0000000000000140 0000ffff907ea170 000000000000011d 00000000000000dc
[ 952.334641] fd80: ffff80000072c000 ffff8000d866c000 0000000000000000 0000000000000002
[ 952.338002] fda0: ffff8000d866fdd0 ffff8000000ebacc ffff800046c1b080 ffff800046c1b7d4
[ 952.340724] fdc0: ffff8000d866fdf0 ffff8000000db67c 0000000000000040 ffff800000e69198
[ 952.343415] fde0: 0000ffff8ffea790 00000000000031ed ffff8000d866fe20 ffff800000254000
[ 952.346101] fe00: 000000000000001d 0000000000000004 ffff8000d866fe90 ffff800000249d3c
[ 952.348980] fe20: ffff8000f8842700 0000000000000000 ffff8000f8842701 0000000000000008
[ 952.351696] fe40: ffff8000d866fe70 0000000000000008 ffff8000d866fe90 ffff800000249cf8
[ 952.354387] fe60: ffff8000f8842700 0000ffff8ffe9170 ffff8000f8842701 0000000000000008
[ 952.357083] fe80: 0000ffff8ffe9278 ffff80008ff85500 0000ffff8ffe90c0 ffff800000085c84
[ 952.359800] fea0: 0000000000000000 0000ffff8ffe9170 ffffffffffffffff 0000ffff90d473bc
[ 952.365351] fec0: 0000000000000000 0000000000000015 0000000000000008 0000000040489426
[ 952.369550] fee0: 0000ffff8ffe9278 0000ffff907ea790 0000ffff907ea170 0000ffff907ea790
[ 952.372416] ff00: 0000ffff907ea170 0000000000000000 000000000000001d 0000000000000004
[ 952.375223] ff20: 0000ffff90a32220 00000000003d0f00 0000ffff907ea0a0 0000ffff8ffe8f30
[ 952.378099] ff40: 0000ffff9100f554 0000ffff91147000 0000ffff91117bc0 0000ffff90d473b0
[ 952.381115] ff60: 0000ffff9100f620 0000ffff880069b0 0000ffff8ffe9170 0000ffff8ffe91a0
[ 952.384003] ff80: 0000ffff8ffe9160 0000ffff8ffe9140 0000ffff88006990 0000ffff8ffe9278
[ 952.386860] ffa0: 0000ffff88008a60 0000ffff8ffe9480 0000ffff88014ca0 0000ffff8ffe90c0
[ 952.389654] ffc0: 0000ffff910be8e8 0000ffff8ffe90c0 0000ffff90d473bc 0000000000000000
[ 952.410986] ffe0: 0000000000000008 000000000000001d 6e2079747265706f 72616d223d656d61
[ 952.415497] Call trace:
[ 952.417403] [<ffff7ffffc39de7c>] changed_cb+0x9f4/0xa48 [btrfs]
[ 952.420023] [<ffff7ffffc308fc0>] btrfs_compare_trees+0x500/0x6b0 [btrfs]
[ 952.422759] [<ffff7ffffc39ecfc>] btrfs_ioctl_send+0xb4c/0xe10 [btrfs]
[ 952.425601] [<ffff7ffffc3643c0>] btrfs_ioctl+0x374/0x29a4 [btrfs]
[ 952.428031] [<ffff8000002499e8>] do_vfs_ioctl+0x33c/0x600
[ 952.430360] [<ffff800000249d3c>] SyS_ioctl+0x90/0xa4
[ 952.432552] [<ffff800000085c84>] el0_svc_naked+0x38/0x3c
[ 952.434803] Code: 2a1503e0 17fffdac b9404282 17ffff28 (d4210000)
[ 952.437457] ---[ end trace 9afd7090c466cf15 ]---
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2016-09-19 17:57:40 +08:00
|
|
|
|
|
|
|
if (result == BTRFS_COMPARE_TREE_CHANGED) {
|
|
|
|
struct extent_buffer *leaf_l;
|
|
|
|
struct extent_buffer *leaf_r;
|
|
|
|
struct btrfs_file_extent_item *ei_l;
|
|
|
|
struct btrfs_file_extent_item *ei_r;
|
|
|
|
|
|
|
|
leaf_l = sctx->left_path->nodes[0];
|
|
|
|
leaf_r = sctx->right_path->nodes[0];
|
|
|
|
ei_l = btrfs_item_ptr(leaf_l,
|
|
|
|
sctx->left_path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
ei_r = btrfs_item_ptr(leaf_r,
|
|
|
|
sctx->right_path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We may have found an extent item that has changed
|
|
|
|
* only its disk_bytenr field and the corresponding
|
|
|
|
* inode item was not updated. This case happens due to
|
|
|
|
* very specific timings during relocation when a leaf
|
|
|
|
* that contains file extent items is COWed while
|
|
|
|
* relocation is ongoing and its in the stage where it
|
|
|
|
* updates data pointers. So when this happens we can
|
|
|
|
* safely ignore it since we know it's the same extent,
|
|
|
|
* but just at different logical and physical locations
|
|
|
|
* (when an extent is fully replaced with a new one, we
|
|
|
|
* know the generation number must have changed too,
|
|
|
|
* since snapshot creation implies committing the current
|
|
|
|
* transaction, and the inode item must have been updated
|
|
|
|
* as well).
|
|
|
|
* This replacement of the disk_bytenr happens at
|
|
|
|
* relocation.c:replace_file_extents() through
|
|
|
|
* relocation.c:btrfs_reloc_cow_block().
|
|
|
|
*/
|
|
|
|
if (btrfs_file_extent_generation(leaf_l, ei_l) ==
|
|
|
|
btrfs_file_extent_generation(leaf_r, ei_r) &&
|
|
|
|
btrfs_file_extent_ram_bytes(leaf_l, ei_l) ==
|
|
|
|
btrfs_file_extent_ram_bytes(leaf_r, ei_r) &&
|
|
|
|
btrfs_file_extent_compression(leaf_l, ei_l) ==
|
|
|
|
btrfs_file_extent_compression(leaf_r, ei_r) &&
|
|
|
|
btrfs_file_extent_encryption(leaf_l, ei_l) ==
|
|
|
|
btrfs_file_extent_encryption(leaf_r, ei_r) &&
|
|
|
|
btrfs_file_extent_other_encoding(leaf_l, ei_l) ==
|
|
|
|
btrfs_file_extent_other_encoding(leaf_r, ei_r) &&
|
|
|
|
btrfs_file_extent_type(leaf_l, ei_l) ==
|
|
|
|
btrfs_file_extent_type(leaf_r, ei_r) &&
|
|
|
|
btrfs_file_extent_disk_bytenr(leaf_l, ei_l) !=
|
|
|
|
btrfs_file_extent_disk_bytenr(leaf_r, ei_r) &&
|
|
|
|
btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) ==
|
|
|
|
btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) &&
|
|
|
|
btrfs_file_extent_offset(leaf_l, ei_l) ==
|
|
|
|
btrfs_file_extent_offset(leaf_r, ei_r) &&
|
|
|
|
btrfs_file_extent_num_bytes(leaf_l, ei_l) ==
|
|
|
|
btrfs_file_extent_num_bytes(leaf_r, ei_r))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
Btrfs: send, don't bug on inconsistent snapshots
When doing an incremental send, if we find a new/modified/deleted extent,
reference or xattr without having previously processed the corresponding
inode item we end up exexuting a BUG_ON(). This is because whenever an
extent, xattr or reference is added, modified or deleted, we always expect
to have the corresponding inode item updated. However there are situations
where this will not happen due to transient -ENOMEM or -ENOSPC errors when
doing delayed inode updates.
For example, when punching holes we can succeed in deleting and modifying
(shrinking) extents but later fail to do the delayed inode update. So after
such failure we close our transaction handle and right after a snapshot of
the fs/subvol tree can be made and used later for a send operation. The
same thing can happen during truncate, link, unlink, and xattr related
operations.
So instead of executing a BUG_ON, make send return an -EIO error and print
an informative error message do dmesg/syslog.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2016-08-01 08:50:37 +08:00
|
|
|
inconsistent_snapshot_error(sctx, result, "extent");
|
|
|
|
return -EIO;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
|
|
|
|
if (result != BTRFS_COMPARE_TREE_DELETED)
|
|
|
|
ret = process_extent(sctx, sctx->left_path,
|
|
|
|
sctx->cmp_key);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-08-17 04:52:55 +08:00
|
|
|
/*
 * Compare the generation of inode @dir in the send root with its generation
 * in the parent root.
 *
 * Returns 1 if the two generations differ, 0 if they are equal, or the
 * non-zero value returned by get_inode_info if either lookup fails.
 */
static int dir_changed(struct send_ctx *sctx, u64 dir)
{
	u64 send_gen;
	u64 parent_gen;
	int ret;

	ret = get_inode_info(sctx->send_root, dir, NULL, &send_gen, NULL, NULL,
			     NULL, NULL);
	if (ret)
		return ret;

	ret = get_inode_info(sctx->parent_root, dir, NULL, &parent_gen, NULL,
			     NULL, NULL, NULL);
	if (ret)
		return ret;

	return parent_gen != send_gen;
}
|
|
|
|
|
|
|
|
static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
|
|
|
|
struct btrfs_key *key)
|
|
|
|
{
|
|
|
|
struct btrfs_inode_extref *extref;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
u64 dirid = 0, last_dirid = 0;
|
|
|
|
unsigned long ptr;
|
|
|
|
u32 item_size;
|
|
|
|
u32 cur_offset = 0;
|
|
|
|
int ref_name_len;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
/* Easy case, just check this one dirid */
|
|
|
|
if (key->type == BTRFS_INODE_REF_KEY) {
|
|
|
|
dirid = key->offset;
|
|
|
|
|
|
|
|
ret = dir_changed(sctx, dirid);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
|
|
|
|
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
|
|
|
|
while (cur_offset < item_size) {
|
|
|
|
extref = (struct btrfs_inode_extref *)(ptr +
|
|
|
|
cur_offset);
|
|
|
|
dirid = btrfs_inode_extref_parent(leaf, extref);
|
|
|
|
ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
|
|
|
|
cur_offset += ref_name_len + sizeof(*extref);
|
|
|
|
if (dirid == last_dirid)
|
|
|
|
continue;
|
|
|
|
ret = dir_changed(sctx, dirid);
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
last_dirid = dirid;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-28 20:11:31 +08:00
|
|
|
/*
|
|
|
|
* Updates compare related fields in sctx and simply forwards to the actual
|
|
|
|
* changed_xxx functions.
|
|
|
|
*/
|
2012-07-26 05:19:24 +08:00
|
|
|
static int changed_cb(struct btrfs_root *left_root,
|
|
|
|
struct btrfs_root *right_root,
|
|
|
|
struct btrfs_path *left_path,
|
|
|
|
struct btrfs_path *right_path,
|
|
|
|
struct btrfs_key *key,
|
|
|
|
enum btrfs_compare_tree_result result,
|
|
|
|
void *ctx)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct send_ctx *sctx = ctx;
|
|
|
|
|
2013-08-17 04:52:55 +08:00
|
|
|
if (result == BTRFS_COMPARE_TREE_SAME) {
|
2013-10-23 00:18:51 +08:00
|
|
|
if (key->type == BTRFS_INODE_REF_KEY ||
|
|
|
|
key->type == BTRFS_INODE_EXTREF_KEY) {
|
|
|
|
ret = compare_refs(sctx, left_path, key);
|
|
|
|
if (!ret)
|
|
|
|
return 0;
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
} else if (key->type == BTRFS_EXTENT_DATA_KEY) {
|
|
|
|
return maybe_send_hole(sctx, left_path, key);
|
|
|
|
} else {
|
2013-08-17 04:52:55 +08:00
|
|
|
return 0;
|
2013-10-23 00:18:51 +08:00
|
|
|
}
|
2013-08-17 04:52:55 +08:00
|
|
|
result = BTRFS_COMPARE_TREE_CHANGED;
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->left_path = left_path;
|
|
|
|
sctx->right_path = right_path;
|
|
|
|
sctx->cmp_key = key;
|
|
|
|
|
|
|
|
ret = finish_inode_if_needed(sctx, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2012-08-01 20:47:03 +08:00
|
|
|
/* Ignore non-FS objects */
|
|
|
|
if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
|
|
|
|
key->objectid == BTRFS_FREE_SPACE_OBJECTID)
|
|
|
|
goto out;
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
if (key->type == BTRFS_INODE_ITEM_KEY)
|
|
|
|
ret = changed_inode(sctx, result);
|
2012-10-15 16:30:45 +08:00
|
|
|
else if (key->type == BTRFS_INODE_REF_KEY ||
|
|
|
|
key->type == BTRFS_INODE_EXTREF_KEY)
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = changed_ref(sctx, result);
|
|
|
|
else if (key->type == BTRFS_XATTR_ITEM_KEY)
|
|
|
|
ret = changed_xattr(sctx, result);
|
|
|
|
else if (key->type == BTRFS_EXTENT_DATA_KEY)
|
|
|
|
ret = changed_extent(sctx, result);
|
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int full_send_tree(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_root *send_root = sctx->send_root;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
int slot;
|
|
|
|
|
|
|
|
path = alloc_path_for_send();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
|
|
|
|
key.type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret)
|
|
|
|
goto out_finish;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
eb = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
btrfs_item_key_to_cpu(eb, &found_key, slot);
|
|
|
|
|
|
|
|
ret = changed_cb(send_root, NULL, path, NULL,
|
|
|
|
&found_key, BTRFS_COMPARE_TREE_NEW, sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
key.objectid = found_key.objectid;
|
|
|
|
key.type = found_key.type;
|
|
|
|
key.offset = found_key.offset + 1;
|
|
|
|
|
|
|
|
ret = btrfs_next_item(send_root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret) {
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
out_finish:
|
|
|
|
ret = finish_inode_if_needed(sctx, 1);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int send_subvol(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2013-04-11 01:10:52 +08:00
|
|
|
if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
|
|
|
|
ret = send_header(sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
ret = send_subvol_begin(sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (sctx->parent_root) {
|
|
|
|
ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
|
|
|
|
changed_cb, sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = finish_inode_if_needed(sctx, 1);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
ret = full_send_tree(sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
free_recorded_refs(sctx);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-10-21 18:11:41 +08:00
|
|
|
/*
|
|
|
|
* If orphan cleanup did remove any orphans from a root, it means the tree
|
|
|
|
* was modified and therefore the commit root is not the same as the current
|
|
|
|
* root anymore. This is a problem, because send uses the commit root and
|
|
|
|
* therefore can see inode items that don't exist in the current root anymore,
|
|
|
|
* and for example make calls to btrfs_iget, which will do tree lookups based
|
|
|
|
* on the current root and not on the commit root. Those lookups will fail,
|
|
|
|
* returning a -ESTALE error, and making send fail with that error. So make
|
|
|
|
* sure a send does not see any orphans we have just removed, and that it will
|
|
|
|
* see the same inodes regardless of whether a transaction commit happened
|
|
|
|
* before it started (meaning that the commit root will be the same as the
|
|
|
|
* current root) or not.
|
|
|
|
*/
|
|
|
|
static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
|
|
|
|
|
|
|
again:
|
|
|
|
if (sctx->parent_root &&
|
|
|
|
sctx->parent_root->node != sctx->parent_root->commit_root)
|
|
|
|
goto commit_trans;
|
|
|
|
|
|
|
|
for (i = 0; i < sctx->clone_roots_cnt; i++)
|
|
|
|
if (sctx->clone_roots[i].root->node !=
|
|
|
|
sctx->clone_roots[i].root->commit_root)
|
|
|
|
goto commit_trans;
|
|
|
|
|
|
|
|
if (trans)
|
2016-09-10 09:39:03 +08:00
|
|
|
return btrfs_end_transaction(trans);
|
2014-10-21 18:11:41 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
commit_trans:
|
|
|
|
/* Use any root, all fs roots will get their commit roots updated. */
|
|
|
|
if (!trans) {
|
|
|
|
trans = btrfs_join_transaction(sctx->send_root);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
2016-09-10 09:39:03 +08:00
|
|
|
return btrfs_commit_transaction(trans);
|
2014-10-21 18:11:41 +08:00
|
|
|
}
|
|
|
|
|
2013-12-17 22:07:20 +08:00
|
|
|
static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
|
|
|
|
{
|
|
|
|
spin_lock(&root->root_item_lock);
|
|
|
|
root->send_in_progress--;
|
|
|
|
/*
|
|
|
|
* Not much left to do, we don't know why it's unbalanced and
|
|
|
|
* can't blindly reset it to 0.
|
|
|
|
*/
|
|
|
|
if (root->send_in_progress < 0)
|
|
|
|
btrfs_err(root->fs_info,
|
2016-06-23 06:54:23 +08:00
|
|
|
"send_in_progres unbalanced %d root %llu",
|
|
|
|
root->send_in_progress, root->root_key.objectid);
|
2013-12-17 22:07:20 +08:00
|
|
|
spin_unlock(&root->root_item_lock);
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
|
|
|
|
struct btrfs_fs_info *fs_info = send_root->fs_info;
|
2012-07-26 05:19:24 +08:00
|
|
|
struct btrfs_root *clone_root;
|
|
|
|
struct btrfs_ioctl_send_args *arg = NULL;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct send_ctx *sctx = NULL;
|
|
|
|
u32 i;
|
|
|
|
u64 *clone_sources_tmp = NULL;
|
2013-12-17 00:34:17 +08:00
|
|
|
int clone_sources_to_rollback = 0;
|
2016-04-12 00:52:02 +08:00
|
|
|
unsigned alloc_size;
|
2014-01-07 17:25:18 +08:00
|
|
|
int sort_clone_roots = 0;
|
2014-01-07 17:25:19 +08:00
|
|
|
int index;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
if (!capable(CAP_SYS_ADMIN))
|
|
|
|
return -EPERM;
|
|
|
|
|
2013-12-17 00:34:17 +08:00
|
|
|
/*
|
|
|
|
* The subvolume must remain read-only during send, protect against
|
2014-04-15 22:41:44 +08:00
|
|
|
* making it RW. This also protects against deletion.
|
2013-12-17 00:34:17 +08:00
|
|
|
*/
|
|
|
|
spin_lock(&send_root->root_item_lock);
|
|
|
|
send_root->send_in_progress++;
|
|
|
|
spin_unlock(&send_root->root_item_lock);
|
|
|
|
|
2013-05-20 23:26:50 +08:00
|
|
|
/*
|
|
|
|
* This is done when we lookup the root, it should already be complete
|
|
|
|
* by the time we get here.
|
|
|
|
*/
|
|
|
|
WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
|
|
|
|
|
2013-12-17 00:34:17 +08:00
|
|
|
/*
|
|
|
|
* Userspace tools do the checks and warn the user if it's
|
|
|
|
* not RO.
|
|
|
|
*/
|
|
|
|
if (!btrfs_root_readonly(send_root)) {
|
|
|
|
ret = -EPERM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
arg = memdup_user(arg_, sizeof(*arg));
|
|
|
|
if (IS_ERR(arg)) {
|
|
|
|
ret = PTR_ERR(arg);
|
|
|
|
arg = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2016-04-13 14:40:59 +08:00
|
|
|
if (arg->clone_sources_count >
|
|
|
|
ULLONG_MAX / sizeof(*arg->clone_sources)) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!access_ok(VERIFY_READ, arg->clone_sources,
|
2013-01-10 16:57:25 +08:00
|
|
|
sizeof(*arg->clone_sources) *
|
|
|
|
arg->clone_sources_count)) {
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = -EFAULT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2013-04-11 01:10:52 +08:00
|
|
|
if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
|
2013-02-05 04:54:57 +08:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2016-01-19 01:42:13 +08:00
|
|
|
sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!sctx) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&sctx->new_refs);
|
|
|
|
INIT_LIST_HEAD(&sctx->deleted_refs);
|
2016-01-19 01:42:13 +08:00
|
|
|
INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
|
2012-07-26 05:19:24 +08:00
|
|
|
INIT_LIST_HEAD(&sctx->name_cache_list);
|
|
|
|
|
2013-02-05 04:54:57 +08:00
|
|
|
sctx->flags = arg->flags;
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->send_filp = fget(arg->send_fd);
|
2013-04-19 09:04:46 +08:00
|
|
|
if (!sctx->send_filp) {
|
|
|
|
ret = -EBADF;
|
2012-07-26 05:19:24 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
sctx->send_root = send_root;
|
2014-04-15 22:41:44 +08:00
|
|
|
/*
|
|
|
|
* Unlikely but possible, if the subvolume is marked for deletion but
|
|
|
|
* is slow to remove the directory entry, send can still be started
|
|
|
|
*/
|
|
|
|
if (btrfs_root_dead(sctx->send_root)) {
|
|
|
|
ret = -EPERM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->clone_roots_cnt = arg->clone_sources_count;
|
|
|
|
|
|
|
|
sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
|
2016-04-12 00:40:08 +08:00
|
|
|
sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!sctx->send_buf) {
|
2016-04-12 00:40:08 +08:00
|
|
|
sctx->send_buf = vmalloc(sctx->send_max_size);
|
|
|
|
if (!sctx->send_buf) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
2016-04-12 00:40:08 +08:00
|
|
|
sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!sctx->read_buf) {
|
2016-04-12 00:40:08 +08:00
|
|
|
sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
|
|
|
|
if (!sctx->read_buf) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower-numbered inode before the higher-numbered
inode was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
.                       (ino 256)
|-- a                   (ino 257)
    |-- b               (ino 258)
        |-- c           (ino 259)
        |   |-- d       (ino 260)
        |
        |-- c2          (ino 261)
And its structure when the second snapshot is taken is:
.                       (ino 256)
|-- a                   (ino 257)
    |-- b               (ino 258)
        |-- c2          (ino 261)
            |-- d2      (ino 260)
                |-- cc  (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
sctx->pending_dir_moves = RB_ROOT;
|
|
|
|
sctx->waiting_dir_moves = RB_ROOT;
|
2014-02-19 22:31:44 +08:00
|
|
|
sctx->orphan_dirs = RB_ROOT;
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower-numbered inode before the higher-numbered
inode was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
.                       (ino 256)
|-- a                   (ino 257)
    |-- b               (ino 258)
        |-- c           (ino 259)
        |   |-- d       (ino 260)
        |
        |-- c2          (ino 261)
And its structure when the second snapshot is taken is:
.                       (ino 256)
|-- a                   (ino 257)
    |-- b               (ino 258)
        |-- c2          (ino 261)
            |-- d2      (ino 260)
                |-- cc  (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
|
2016-04-12 00:52:02 +08:00
|
|
|
alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
|
|
|
|
|
2016-04-12 00:40:08 +08:00
|
|
|
sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!sctx->clone_roots) {
|
2016-04-12 00:40:08 +08:00
|
|
|
sctx->clone_roots = vzalloc(alloc_size);
|
|
|
|
if (!sctx->clone_roots) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
2016-04-12 00:52:02 +08:00
|
|
|
alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
if (arg->clone_sources_count) {
|
2016-04-12 00:40:08 +08:00
|
|
|
clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (!clone_sources_tmp) {
|
2016-04-12 00:40:08 +08:00
|
|
|
clone_sources_tmp = vmalloc(alloc_size);
|
|
|
|
if (!clone_sources_tmp) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
|
2016-04-12 00:52:02 +08:00
|
|
|
alloc_size);
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret) {
|
|
|
|
ret = -EFAULT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < arg->clone_sources_count; i++) {
|
|
|
|
key.objectid = clone_sources_tmp[i];
|
|
|
|
key.type = BTRFS_ROOT_ITEM_KEY;
|
|
|
|
key.offset = (u64)-1;
|
2014-01-07 17:25:19 +08:00
|
|
|
|
|
|
|
index = srcu_read_lock(&fs_info->subvol_srcu);
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
|
|
|
|
if (IS_ERR(clone_root)) {
|
2014-01-07 17:25:19 +08:00
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = PTR_ERR(clone_root);
|
|
|
|
goto out;
|
|
|
|
}
|
2013-12-17 00:34:17 +08:00
|
|
|
spin_lock(&clone_root->root_item_lock);
|
2015-03-03 04:53:52 +08:00
|
|
|
if (!btrfs_root_readonly(clone_root) ||
|
|
|
|
btrfs_root_dead(clone_root)) {
|
2013-12-17 00:34:17 +08:00
|
|
|
spin_unlock(&clone_root->root_item_lock);
|
2014-01-07 17:25:19 +08:00
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
2013-12-17 00:34:17 +08:00
|
|
|
ret = -EPERM;
|
|
|
|
goto out;
|
|
|
|
}
|
2015-03-03 04:53:53 +08:00
|
|
|
clone_root->send_in_progress++;
|
2013-12-17 00:34:17 +08:00
|
|
|
spin_unlock(&clone_root->root_item_lock);
|
2014-01-07 17:25:19 +08:00
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->clone_roots[i].root = clone_root;
|
2015-03-03 04:53:53 +08:00
|
|
|
clone_sources_to_rollback = i + 1;
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
2016-04-12 00:40:08 +08:00
|
|
|
kvfree(clone_sources_tmp);
|
2012-07-26 05:19:24 +08:00
|
|
|
clone_sources_tmp = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (arg->parent_root) {
|
|
|
|
key.objectid = arg->parent_root;
|
|
|
|
key.type = BTRFS_ROOT_ITEM_KEY;
|
|
|
|
key.offset = (u64)-1;
|
2014-01-07 17:25:19 +08:00
|
|
|
|
|
|
|
index = srcu_read_lock(&fs_info->subvol_srcu);
|
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
|
2013-05-13 22:42:57 +08:00
|
|
|
if (IS_ERR(sctx->parent_root)) {
|
2014-01-07 17:25:19 +08:00
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
2013-05-13 22:42:57 +08:00
|
|
|
ret = PTR_ERR(sctx->parent_root);
|
2012-07-26 05:19:24 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2014-01-07 17:25:19 +08:00
|
|
|
|
2013-12-17 00:34:17 +08:00
|
|
|
spin_lock(&sctx->parent_root->root_item_lock);
|
|
|
|
sctx->parent_root->send_in_progress++;
|
2014-04-15 22:41:44 +08:00
|
|
|
if (!btrfs_root_readonly(sctx->parent_root) ||
|
|
|
|
btrfs_root_dead(sctx->parent_root)) {
|
2013-12-17 00:34:17 +08:00
|
|
|
spin_unlock(&sctx->parent_root->root_item_lock);
|
2014-01-07 17:25:19 +08:00
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
2013-12-17 00:34:17 +08:00
|
|
|
ret = -EPERM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
spin_unlock(&sctx->parent_root->root_item_lock);
|
2014-01-07 17:25:19 +08:00
|
|
|
|
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
2012-07-26 05:19:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Clones from send_root are allowed, but only if the clone source
|
|
|
|
* is behind the current send position. This is checked while searching
|
|
|
|
* for possible clone sources.
|
|
|
|
*/
|
|
|
|
sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
|
|
|
|
|
|
|
|
/* We do a bsearch later */
|
|
|
|
sort(sctx->clone_roots, sctx->clone_roots_cnt,
|
|
|
|
sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
|
|
|
|
NULL);
|
2014-01-07 17:25:18 +08:00
|
|
|
sort_clone_roots = 1;
|
2012-07-26 05:19:24 +08:00
|
|
|
|
2014-10-21 18:11:41 +08:00
|
|
|
ret = ensure_commit_roots_uptodate(sctx);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
2014-07-31 06:43:18 +08:00
|
|
|
current->journal_info = BTRFS_SEND_TRANS_STUB;
|
2012-07-26 05:19:24 +08:00
|
|
|
ret = send_subvol(sctx);
|
2014-03-29 05:07:27 +08:00
|
|
|
current->journal_info = NULL;
|
2012-07-26 05:19:24 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2013-04-11 01:10:52 +08:00
|
|
|
if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
|
|
|
|
ret = begin_cmd(sctx, BTRFS_SEND_C_END);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = send_cmd(sctx);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
out:
|
Btrfs: fix infinite path build loops in incremental send
The send operation processes inodes by their ascending number, and assumes
that any rename/move operation can be successfully performed (sent to the
caller) once all previous inodes (those with a smaller inode number than the
one we're currently processing) were processed.
This is not true when an incremental send had to process a hierarchical change
between 2 snapshots where the parent-children relationship between directory
inodes was reversed - that is, parents became children and children became
parents. This situation made the path building code go into an infinite loop,
which kept allocating more and more memory that eventually led to a krealloc
warning being displayed in dmesg:
WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0()
Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid
CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012
[ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007
[ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0
[ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0
[ 5381.660457] Call Trace:
[ 5381.660464] [<ffffffff81777434>] dump_stack+0x4e/0x68
[ 5381.660471] [<ffffffff8104a9ec>] warn_slowpath_common+0x8c/0xc0
[ 5381.660476] [<ffffffff8104aa3a>] warn_slowpath_null+0x1a/0x20
[ 5381.660480] [<ffffffff81144995>] __alloc_pages_nodemask+0x365/0xad0
[ 5381.660487] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660491] [<ffffffff811430e8>] ? free_one_page+0x98/0x440
[ 5381.660495] [<ffffffff8108313f>] ? local_clock+0x4f/0x60
[ 5381.660502] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660508] [<ffffffff81095fb8>] ? trace_hardirqs_off_caller+0x28/0xd0
[ 5381.660515] [<ffffffff81183caf>] alloc_pages_current+0x10f/0x1f0
[ 5381.660520] [<ffffffff8113fae4>] ? __get_free_pages+0x14/0x50
[ 5381.660524] [<ffffffff8113fae4>] __get_free_pages+0x14/0x50
[ 5381.660530] [<ffffffff8115dace>] kmalloc_order_trace+0x3e/0x100
[ 5381.660536] [<ffffffff81191ea0>] __kmalloc_track_caller+0x220/0x230
[ 5381.660560] [<ffffffffa0729fdb>] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660564] [<ffffffff8178085c>] ? retint_restore_args+0xe/0xe
[ 5381.660569] [<ffffffff811580ef>] krealloc+0x6f/0xb0
[ 5381.660586] [<ffffffffa0729fdb>] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs]
[ 5381.660601] [<ffffffffa072a208>] fs_path_prepare_for_add+0x98/0xb0 [btrfs]
[ 5381.660615] [<ffffffffa072a2bc>] fs_path_add_path+0x2c/0x60 [btrfs]
[ 5381.660628] [<ffffffffa072c55c>] get_cur_path+0x7c/0x1c0 [btrfs]
Even without this loop, the incremental send couldn't succeed, because it would attempt
to send a rename/move operation for the lower-numbered inode before the higher-numbered
inode was renamed/moved. This issue is easy to trigger with the following steps:
$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ mkdir -p /mnt/btrfs/a/b/c/d
$ mkdir /mnt/btrfs/a/b/c2
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1
$ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2
$ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc
$ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2
$ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send
The structure of the filesystem when the first snapshot is taken is:
.                       (ino 256)
|-- a                   (ino 257)
    |-- b               (ino 258)
        |-- c           (ino 259)
        |   |-- d       (ino 260)
        |
        |-- c2          (ino 261)
And its structure when the second snapshot is taken is:
.                       (ino 256)
|-- a                   (ino 257)
    |-- b               (ino 258)
        |-- c2          (ino 261)
            |-- d2      (ino 260)
                |-- cc  (ino 259)
Before the move/rename operation is performed for the inode 259, the
move/rename for inode 260 must be performed, since 259 is now a child
of 260.
A test case for xfstests, with a more complex scenario, will follow soon.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-22 18:00:53 +08:00
|
|
|
WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
|
|
|
|
while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
|
|
|
|
struct rb_node *n;
|
|
|
|
struct pending_dir_move *pm;
|
|
|
|
|
|
|
|
n = rb_first(&sctx->pending_dir_moves);
|
|
|
|
pm = rb_entry(n, struct pending_dir_move, node);
|
|
|
|
while (!list_empty(&pm->list)) {
|
|
|
|
struct pending_dir_move *pm2;
|
|
|
|
|
|
|
|
pm2 = list_first_entry(&pm->list,
|
|
|
|
struct pending_dir_move, list);
|
|
|
|
free_pending_move(sctx, pm2);
|
|
|
|
}
|
|
|
|
free_pending_move(sctx, pm);
|
|
|
|
}
|
|
|
|
|
|
|
|
WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
|
|
|
|
while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
|
|
|
|
struct rb_node *n;
|
|
|
|
struct waiting_dir_move *dm;
|
|
|
|
|
|
|
|
n = rb_first(&sctx->waiting_dir_moves);
|
|
|
|
dm = rb_entry(n, struct waiting_dir_move, node);
|
|
|
|
rb_erase(&dm->node, &sctx->waiting_dir_moves);
|
|
|
|
kfree(dm);
|
|
|
|
}
|
|
|
|
|
2014-02-19 22:31:44 +08:00
|
|
|
WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
|
|
|
|
while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
|
|
|
|
struct rb_node *n;
|
|
|
|
struct orphan_dir_info *odi;
|
|
|
|
|
|
|
|
n = rb_first(&sctx->orphan_dirs);
|
|
|
|
odi = rb_entry(n, struct orphan_dir_info, node);
|
|
|
|
free_orphan_dir_info(sctx, odi);
|
|
|
|
}
|
|
|
|
|
2014-01-07 17:25:18 +08:00
|
|
|
if (sort_clone_roots) {
|
|
|
|
for (i = 0; i < sctx->clone_roots_cnt; i++)
|
|
|
|
btrfs_root_dec_send_in_progress(
|
|
|
|
sctx->clone_roots[i].root);
|
|
|
|
} else {
|
|
|
|
for (i = 0; sctx && i < clone_sources_to_rollback; i++)
|
|
|
|
btrfs_root_dec_send_in_progress(
|
|
|
|
sctx->clone_roots[i].root);
|
|
|
|
|
|
|
|
btrfs_root_dec_send_in_progress(send_root);
|
|
|
|
}
|
2013-12-17 22:07:20 +08:00
|
|
|
if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
|
|
|
|
btrfs_root_dec_send_in_progress(sctx->parent_root);
|
2013-12-17 00:34:17 +08:00
|
|
|
|
2012-07-26 05:19:24 +08:00
|
|
|
kfree(arg);
|
2016-04-12 00:40:08 +08:00
|
|
|
kvfree(clone_sources_tmp);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
if (sctx) {
|
|
|
|
if (sctx->send_filp)
|
|
|
|
fput(sctx->send_filp);
|
|
|
|
|
2016-04-12 00:40:08 +08:00
|
|
|
kvfree(sctx->clone_roots);
|
2016-04-12 00:40:08 +08:00
|
|
|
kvfree(sctx->send_buf);
|
2016-04-12 00:40:08 +08:00
|
|
|
kvfree(sctx->read_buf);
|
2012-07-26 05:19:24 +08:00
|
|
|
|
|
|
|
name_cache_free(sctx);
|
|
|
|
|
|
|
|
kfree(sctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|