UBI: handle write errors in WL worker

When a PEB is moved and a write error happens, UBI switches to R/O mode, which is wrong, because we just copy the data and may select a different PEB and re-try this. This patch fixes WL worker's behavior. Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
2024-11-11 21:38:32 +08:00 · 2008-12-05 13:37:02 +02:00 · 2008-12-05 13:37:02 +02:00 · 6fa6f5bbc3
commit 6fa6f5bbc3
parent 3c98b0a043
2 changed files with 44 additions and 25 deletions
--- a/drivers/mtd/ubi/eba.c
+++ b/drivers/mtd/ubi/eba.c
@ -949,10 +949,14 @@ write_error:
 * This function copies logical eraseblock from physical eraseblock @from to
 * physical eraseblock @to. The @vid_hdr buffer may be changed by this
 * function. Returns:
- *   o %0  in case of success;
- *   o %1 if the operation was canceled and should be tried later (e.g.,
- *     because a bit-flip was detected at the target PEB);
- *   o %2 if the volume is being deleted and this LEB should not be moved.
+ *   o %0 in case of success;
+ *   o %1 if the operation was canceled because the volume is being deleted
+ *        or because the PEB was put meanwhile;
+ *   o %2 if the operation was canceled because there was a write error to the
+ *        target PEB;
+ *   o %-EAGAIN if the operation was canceled because a bit-flip was detected
+ *     in the target PEB;
+ *   o a negative error code in case of failure.
 */
 int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
 		     struct ubi_vid_hdr *vid_hdr)
@ -978,7 +982,7 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
 	/*
 	 * Note, we may race with volume deletion, which means that the volume
 	 * this logical eraseblock belongs to might be being deleted. Since the
-	 * volume deletion unmaps all the volume's logical eraseblocks, it will
+	 * volume deletion un-maps all the volume's logical eraseblocks, it will
 	 * be locked in 'ubi_wl_put_peb()' and wait for the WL worker to finish.
 	 */
 	vol = ubi->volumes[idx];
@ -986,7 +990,7 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
 		/* No need to do further work, cancel */
 		dbg_eba("volume %d is being removed, cancel", vol_id);
 		spin_unlock(&ubi->volumes_lock);
-		return 2;
+		return 1;
 	}
 	spin_unlock(&ubi->volumes_lock);

@ -1023,7 +1027,7 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,

 	/*
 	 * OK, now the LEB is locked and we can safely start moving it. Since
-	 * this function utilizes thie @ubi->peb1_buf buffer which is shared
+	 * this function utilizes the @ubi->peb1_buf buffer which is shared
 	 * with some other functions, so lock the buffer by taking the
 	 * @ubi->buf_mutex.
 	 */
@ -1068,8 +1072,11 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
 	vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));

 	err = ubi_io_write_vid_hdr(ubi, to, vid_hdr);
-	if (err)
+	if (err) {
+		if (err == -EIO)
+			err = 2;
 		goto out_unlock_buf;
+	}

 	cond_resched();

@ -1079,14 +1086,17 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
 		if (err != UBI_IO_BITFLIPS)
 			ubi_warn("cannot read VID header back from PEB %d", to);
 		else
-			err = 1;
+			err = -EAGAIN;
 		goto out_unlock_buf;
 	}

 	if (data_size > 0) {
 		err = ubi_io_write_data(ubi, ubi->peb_buf1, to, 0, aldata_size);
-		if (err)
+		if (err) {
+			if (err == -EIO)
+				err = 2;
 			goto out_unlock_buf;
+		}

 		cond_resched();

@ -1101,15 +1111,16 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
 				ubi_warn("cannot read data back from PEB %d",
 					 to);
 			else
-				err = 1;
+				err = -EAGAIN;
 			goto out_unlock_buf;
 		}

 		cond_resched();

 		if (memcmp(ubi->peb_buf1, ubi->peb_buf2, aldata_size)) {
-			ubi_warn("read data back from PEB %d - it is different",
-				 to);
+			ubi_warn("read data back from PEB %d and it is "
+				 "different", to);
+			err = -EINVAL;
 			goto out_unlock_buf;
 		}
 	}
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@ -738,7 +738,7 @@ static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
 static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
 				int cancel)
 {
-	int err, scrubbing = 0;
+	int err, scrubbing = 0, torture = 0;
 	struct ubi_wl_prot_entry *uninitialized_var(pe);
 	struct ubi_wl_entry *e1, *e2;
 	struct ubi_vid_hdr *vid_hdr;
@ -842,20 +842,26 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,

 	err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr);
 	if (err) {
-
+		if (err == -EAGAIN)
+			goto out_not_moved;
 		if (err < 0)
 			goto out_error;
-		if (err == 1)
+		if (err == 2) {
+			/* Target PEB write error, torture it */
+			torture = 1;
 			goto out_not_moved;
+		}

 		/*
-		 * For some reason the LEB was not moved - it might be because
-		 * the volume is being deleted. We should prevent this PEB from
-		 * being selected for wear-levelling movement for some "time",
-		 * so put it to the protection tree.
+		 * The LEB has not been moved because the volume is being
+		 * deleted or the PEB has been put meanwhile. We should prevent
+		 * this PEB from being selected for wear-leveling movement
+		 * again, so put it to the protection tree.
 		 */

-		dbg_wl("cancelled moving PEB %d", e1->pnum);
+		dbg_wl("canceled moving PEB %d", e1->pnum);
+		ubi_assert(err == 1);
+
 		pe = kmalloc(sizeof(struct ubi_wl_prot_entry), GFP_NOFS);
 		if (!pe) {
 			err = -ENOMEM;
@ -920,9 +926,10 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
 	/*
 	 * For some reasons the LEB was not moved, might be an error, might be
 	 * something else. @e1 was not changed, so return it back. @e2 might
-	 * be changed, schedule it for erasure.
+	 * have been changed, schedule it for erasure.
 	 */
 out_not_moved:
+	dbg_wl("canceled moving PEB %d", e1->pnum);
 	ubi_free_vid_hdr(ubi, vid_hdr);
 	vid_hdr = NULL;
 	spin_lock(&ubi->wl_lock);
@ -930,12 +937,13 @@ out_not_moved:
 		wl_tree_add(e1, &ubi->scrub);
 	else
 		wl_tree_add(e1, &ubi->used);
+	ubi_assert(!ubi->move_to_put);
 	ubi->move_from = ubi->move_to = NULL;
-	ubi->move_to_put = ubi->wl_scheduled = 0;
+	ubi->wl_scheduled = 0;
 	spin_unlock(&ubi->wl_lock);

 	e1 = NULL;
-	err = schedule_erase(ubi, e2, 0);
+	err = schedule_erase(ubi, e2, torture);
 	if (err)
 		goto out_error;

@ -1324,7 +1332,7 @@ int ubi_wl_flush(struct ubi_device *ubi)
 	up_write(&ubi->work_sem);

 	/*
-	 * And in case last was the WL worker and it cancelled the LEB
+	 * And in case last was the WL worker and it canceled the LEB
 	 * movement, flush again.
 	 */
 	while (ubi->works_count) {