Merge branch 'for-3.14/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver changes from Jens Axboe:

 - bcache update from Kent Overstreet.

 - two bcache fixes from Nicholas Swenson.

 - cciss pci init error fix from Andrew.

 - underflow fix in the parallel IDE pg_write code from Dan Carpenter.
   I'm sure the 1 (or 0) users of that are now happy.

 - two PCI related fixes for sx8 from Jingoo Han.

 - floppy init fix for first block read from Jiri Kosina.

 - pktcdvd error return miss fix from Julia Lawall.

 - removal of IRQF_SHARED from the SEGA Dreamcast CD-ROM code from
   Michael Opdenacker.

 - comment typo fix for the loop driver from Olaf Hering.

 - potential oops fix for null_blk from Raghavendra K T.

 - two fixes from Sam Bradshaw (Micron) for the mtip32xx driver, fixing
   an OOM problem and a problem with handling security locked conditions.

* 'for-3.14/drivers' of git://git.kernel.dk/linux-block: (47 commits)
  mg_disk: Spelling s/finised/finished/
  null_blk: Null pointer deference problem in alloc_page_buffers
  mtip32xx: Correctly handle security locked condition
  mtip32xx: Make SGL container per-command to eliminate high order dma allocation
  drivers/block/loop.c: fix comment typo in loop_config_discard
  drivers/block/cciss.c:cciss_init_one(): use proper errnos
  drivers/block/paride/pg.c: underflow bug in pg_write()
  drivers/block/sx8.c: remove unnecessary pci_set_drvdata()
  drivers/block/sx8.c: use module_pci_driver()
  floppy: bail out in open() if drive is not responding to block0 read
  bcache: Fix auxiliary search trees for key size > cacheline size
  bcache: Don't return -EINTR when insert finished
  bcache: Improve bucket_prio() calculation
  bcache: Add bch_bkey_equal_header()
  bcache: update bch_bkey_try_merge
  bcache: Move insert_fixup() to btree_keys_ops
  bcache: Convert sorting to btree_keys
  bcache: Convert debug code to btree_keys
  bcache: Convert btree_iter to struct btree_keys
  bcache: Refactor bset_tree sysfs stats
  ...
commit 53d8ab29f8
@@ -592,6 +592,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 			ret = -1;
 		}
 
+	t->raid_partial_stripes_expensive =
+		max(t->raid_partial_stripes_expensive,
+		    b->raid_partial_stripes_expensive);
+
 	/* Find lowest common alignment_offset */
 	t->alignment_offset = lcm(t->alignment_offset, alignment)
 		& (max(t->physical_block_size, t->io_min) - 1);
@@ -5004,7 +5004,7 @@ reinit_after_soft_reset:
 
 	i = alloc_cciss_hba(pdev);
 	if (i < 0)
-		return -1;
+		return -ENOMEM;
 
 	h = hba[i];
 	h->pdev = pdev;
@@ -5205,7 +5205,7 @@ clean_no_release_regions:
 	 */
 	pci_set_drvdata(pdev, NULL);
 	free_hba(h);
-	return -1;
+	return -ENODEV;
 }
 
 static void cciss_shutdown(struct pci_dev *pdev)
@@ -3691,9 +3691,12 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	if (!(mode & FMODE_NDELAY)) {
 		if (mode & (FMODE_READ|FMODE_WRITE)) {
 			UDRS->last_checked = 0;
+			clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
 			check_disk_change(bdev);
 			if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
 				goto out;
+			if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags))
+				goto out;
 		}
 		res = -EROFS;
 		if ((mode & FMODE_WRITE) &&
@@ -3746,17 +3749,29 @@ static unsigned int floppy_check_events(struct gendisk *disk,
  * a disk in the drive, and whether that disk is writable.
  */
 
-static void floppy_rb0_complete(struct bio *bio, int err)
+struct rb0_cbdata {
+	int drive;
+	struct completion complete;
+};
+
+static void floppy_rb0_cb(struct bio *bio, int err)
 {
-	complete((struct completion *)bio->bi_private);
+	struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private;
+	int drive = cbdata->drive;
+
+	if (err) {
+		pr_info("floppy: error %d while reading block 0", err);
+		set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
+	}
+	complete(&cbdata->complete);
 }
 
-static int __floppy_read_block_0(struct block_device *bdev)
+static int __floppy_read_block_0(struct block_device *bdev, int drive)
 {
 	struct bio bio;
 	struct bio_vec bio_vec;
-	struct completion complete;
 	struct page *page;
+	struct rb0_cbdata cbdata;
 	size_t size;
 
 	page = alloc_page(GFP_NOIO);
@@ -3769,6 +3784,8 @@ static int __floppy_read_block_0(struct block_device *bdev)
 	if (!size)
 		size = 1024;
 
+	cbdata.drive = drive;
+
 	bio_init(&bio);
 	bio.bi_io_vec = &bio_vec;
 	bio_vec.bv_page = page;
@@ -3779,13 +3796,14 @@ static int __floppy_read_block_0(struct block_device *bdev)
 	bio.bi_bdev = bdev;
 	bio.bi_iter.bi_sector = 0;
 	bio.bi_flags = (1 << BIO_QUIET);
-	init_completion(&complete);
-	bio.bi_private = &complete;
-	bio.bi_end_io = floppy_rb0_complete;
+	bio.bi_private = &cbdata;
+	bio.bi_end_io = floppy_rb0_cb;
 
 	submit_bio(READ, &bio);
 	process_fd_request();
-	wait_for_completion(&complete);
+
+	init_completion(&cbdata.complete);
+	wait_for_completion(&cbdata.complete);
 
 	__free_page(page);
 
@@ -3827,7 +3845,7 @@ static int floppy_revalidate(struct gendisk *disk)
 		UDRS->generation++;
 		if (drive_no_geom(drive)) {
 			/* auto-sensing */
-			res = __floppy_read_block_0(opened_bdev[drive]);
+			res = __floppy_read_block_0(opened_bdev[drive], drive);
 		} else {
 			if (cf)
 				poll_drive(false, FD_RAW_NEED_DISK);
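The floppy change illustrates a common pattern: when a completion callback needs more context than a bare struct completion, bundle both into a small struct and pass that through the private pointer. A generic userspace sketch of the same idea (pthreads stand in for the kernel's completion API; names are hypothetical):

    /* Sketch of the rb0_cbdata pattern: completion plus extra state. */
    #include <pthread.h>
    #include <stdio.h>

    struct cbdata {
        int drive;                 /* extra context, like rb0_cbdata.drive */
        pthread_mutex_t lock;
        pthread_cond_t done_cv;
        int done;
    };

    static void rb0_cb(struct cbdata *cb, int err)
    {
        if (err)                   /* callback can now act on the context */
            printf("error %d while reading block 0 on drive %d\n",
                   err, cb->drive);
        pthread_mutex_lock(&cb->lock);
        cb->done = 1;              /* equivalent of complete() */
        pthread_cond_signal(&cb->done_cv);
        pthread_mutex_unlock(&cb->lock);
    }

    int main(void)
    {
        struct cbdata cb = { .drive = 0 };

        pthread_mutex_init(&cb.lock, NULL);
        pthread_cond_init(&cb.done_cv, NULL);

        rb0_cb(&cb, -5);           /* pretend the I/O completed with -EIO */

        pthread_mutex_lock(&cb.lock);
        while (!cb.done)           /* equivalent of wait_for_completion() */
            pthread_cond_wait(&cb.done_cv, &cb.lock);
        pthread_mutex_unlock(&cb.lock);
        return 0;
    }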
@@ -799,7 +799,7 @@ static void loop_config_discard(struct loop_device *lo)
 
 	/*
 	 * We use punch hole to reclaim the free space used by the
-	 * image a.k.a. discard. However we do support discard if
+	 * image a.k.a. discard. However we do not support discard if
 	 * encryption is enabled, because it may give an attacker
 	 * useful information.
 	 */
@@ -915,7 +915,7 @@ static int mg_probe(struct platform_device *plat_dev)
 
 	/* disk reset */
 	if (prv_data->dev_attr == MG_STORAGE_DEV) {
-		/* If POR seq. not yet finised, wait */
+		/* If POR seq. not yet finished, wait */
 		err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT);
 		if (err)
 			goto probe_err_3b;
@@ -41,10 +41,31 @@
 #include "mtip32xx.h"
 
-#define HW_CMD_SLOT_SZ		(MTIP_MAX_COMMAND_SLOTS * 32)
-#define HW_CMD_TBL_SZ		(AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16))
-#define HW_CMD_TBL_AR_SZ	(HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS)
-#define HW_PORT_PRIV_DMA_SZ \
-		(HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ)
-
+/* DMA region containing RX Fis, Identify, RLE10, and SMART buffers */
+#define AHCI_RX_FIS_SZ		0x100
+#define AHCI_RX_FIS_OFFSET	0x0
+#define AHCI_IDFY_SZ		ATA_SECT_SIZE
+#define AHCI_IDFY_OFFSET	0x400
+#define AHCI_SECTBUF_SZ		ATA_SECT_SIZE
+#define AHCI_SECTBUF_OFFSET	0x800
+#define AHCI_SMARTBUF_SZ	ATA_SECT_SIZE
+#define AHCI_SMARTBUF_OFFSET	0xC00
+/* 0x100 + 0x200 + 0x200 + 0x200 is smaller than 4k but we pad it out */
+#define BLOCK_DMA_ALLOC_SZ	4096
+
+/* DMA region containing command table (should be 8192 bytes) */
+#define AHCI_CMD_SLOT_SZ	sizeof(struct mtip_cmd_hdr)
+#define AHCI_CMD_TBL_SZ		(MTIP_MAX_COMMAND_SLOTS * AHCI_CMD_SLOT_SZ)
+#define AHCI_CMD_TBL_OFFSET	0x0
+
+/* DMA region per command (contains header and SGL) */
+#define AHCI_CMD_TBL_HDR_SZ	0x80
+#define AHCI_CMD_TBL_HDR_OFFSET	0x0
+#define AHCI_CMD_TBL_SGL_SZ	(MTIP_MAX_SG * sizeof(struct mtip_cmd_sg))
+#define AHCI_CMD_TBL_SGL_OFFSET	AHCI_CMD_TBL_HDR_SZ
+#define CMD_DMA_ALLOC_SZ	(AHCI_CMD_TBL_SGL_SZ + AHCI_CMD_TBL_HDR_SZ)
+
 
 #define HOST_CAP_NZDMA		(1 << 19)
 #define HOST_HSORG		0xFC
@@ -899,8 +920,9 @@ static void mtip_handle_tfe(struct driver_data *dd)
 			fail_reason = "thermal shutdown";
 		}
 		if (buf[288] == 0xBF) {
+			set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag);
 			dev_info(&dd->pdev->dev,
-				"Drive indicates rebuild has failed.\n");
+				"Drive indicates rebuild has failed. Secure erase required.\n");
 			fail_all_ncq_cmds = 1;
 			fail_reason = "rebuild failed";
 		}
@@ -1566,6 +1588,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
 	}
 #endif
 
+	/* Check security locked state */
+	if (port->identify[128] & 0x4)
+		set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+	else
+		clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+
 #ifdef MTIP_TRIM /* Disabling TRIM support temporarily */
 	/* Demux ID.DRAT & ID.RZAT to determine trim support */
 	if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5))
@@ -1887,6 +1915,10 @@ static void mtip_dump_identify(struct mtip_port *port)
 	strlcpy(cbuf, (char *)(port->identify+27), 41);
 	dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
 
+	dev_info(&port->dd->pdev->dev, "Security: %04x %s\n",
+		port->identify[128],
+		port->identify[128] & 0x4 ? "(LOCKED)" : "");
+
 	if (mtip_hw_get_capacity(port->dd, &sectors))
 		dev_info(&port->dd->pdev->dev,
 			"Capacity: %llu sectors (%llu MB)\n",
@@ -3312,6 +3344,118 @@ st_out:
 	return 0;
 }
 
+/*
+ * DMA region teardown
+ *
+ * @dd Pointer to driver_data structure
+ *
+ * return value
+ *	None
+ */
+static void mtip_dma_free(struct driver_data *dd)
+{
+	int i;
+	struct mtip_port *port = dd->port;
+
+	if (port->block1)
+		dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+					port->block1, port->block1_dma);
+
+	if (port->command_list) {
+		dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+				port->command_list, port->command_list_dma);
+	}
+
+	for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) {
+		if (port->commands[i].command)
+			dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+				port->commands[i].command,
+				port->commands[i].command_dma);
+	}
+}
+
+/*
+ * DMA region setup
+ *
+ * @dd Pointer to driver_data structure
+ *
+ * return value
+ *	-ENOMEM Not enough free DMA region space to initialize driver
+ */
+static int mtip_dma_alloc(struct driver_data *dd)
+{
+	struct mtip_port *port = dd->port;
+	int i, rv = 0;
+	u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
+
+	/* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */
+	port->block1 =
+		dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+					&port->block1_dma, GFP_KERNEL);
+	if (!port->block1)
+		return -ENOMEM;
+	memset(port->block1, 0, BLOCK_DMA_ALLOC_SZ);
+
+	/* Allocate dma memory for command list */
+	port->command_list =
+		dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+					&port->command_list_dma, GFP_KERNEL);
+	if (!port->command_list) {
+		dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+					port->block1, port->block1_dma);
+		port->block1 = NULL;
+		port->block1_dma = 0;
+		return -ENOMEM;
+	}
+	memset(port->command_list, 0, AHCI_CMD_TBL_SZ);
+
+	/* Setup all pointers into first DMA region */
+	port->rxfis		= port->block1 + AHCI_RX_FIS_OFFSET;
+	port->rxfis_dma		= port->block1_dma + AHCI_RX_FIS_OFFSET;
+	port->identify		= port->block1 + AHCI_IDFY_OFFSET;
+	port->identify_dma	= port->block1_dma + AHCI_IDFY_OFFSET;
+	port->log_buf		= port->block1 + AHCI_SECTBUF_OFFSET;
+	port->log_buf_dma	= port->block1_dma + AHCI_SECTBUF_OFFSET;
+	port->smart_buf		= port->block1 + AHCI_SMARTBUF_OFFSET;
+	port->smart_buf_dma	= port->block1_dma + AHCI_SMARTBUF_OFFSET;
+
+	/* Setup per command SGL DMA region */
+
+	/* Point the command headers at the command tables */
+	for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) {
+		port->commands[i].command =
+			dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+				&port->commands[i].command_dma, GFP_KERNEL);
+		if (!port->commands[i].command) {
+			rv = -ENOMEM;
+			mtip_dma_free(dd);
+			return rv;
+		}
+		memset(port->commands[i].command, 0, CMD_DMA_ALLOC_SZ);
+
+		port->commands[i].command_header = port->command_list +
+					(sizeof(struct mtip_cmd_hdr) * i);
+		port->commands[i].command_header_dma =
+					dd->port->command_list_dma +
+					(sizeof(struct mtip_cmd_hdr) * i);
+
+		if (host_cap_64)
+			port->commands[i].command_header->ctbau =
+				__force_bit2int cpu_to_le32(
+				(port->commands[i].command_dma >> 16) >> 16);
+
+		port->commands[i].command_header->ctba =
+				__force_bit2int cpu_to_le32(
+				port->commands[i].command_dma & 0xFFFFFFFF);
+
+		sg_init_table(port->commands[i].sg, MTIP_MAX_SG);
+
+		/* Mark command as currently inactive */
+		atomic_set(&dd->port->commands[i].active, 0);
+	}
+	return 0;
+}
+
 /*
  * Called once for each card.
  *
@@ -3370,83 +3514,10 @@ static int mtip_hw_init(struct driver_data *dd)
 	dd->port->mmio	= dd->mmio + PORT_OFFSET;
 	dd->port->dd	= dd;
 
-	/* Allocate memory for the command list. */
-	dd->port->command_list =
-		dmam_alloc_coherent(&dd->pdev->dev,
-			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
-			&dd->port->command_list_dma,
-			GFP_KERNEL);
-	if (!dd->port->command_list) {
-		dev_err(&dd->pdev->dev,
-			"Memory allocation: command list\n");
-		rv = -ENOMEM;
+	/* DMA allocations */
+	rv = mtip_dma_alloc(dd);
+	if (rv < 0)
 		goto out1;
-	}
-
-	/* Clear the memory we have allocated. */
-	memset(dd->port->command_list,
-		0,
-		HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4));
-
-	/* Setup the addresse of the RX FIS. */
-	dd->port->rxfis = dd->port->command_list + HW_CMD_SLOT_SZ;
-	dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ;
-
-	/* Setup the address of the command tables. */
-	dd->port->command_table = dd->port->rxfis + AHCI_RX_FIS_SZ;
-	dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ;
-
-	/* Setup the address of the identify data. */
-	dd->port->identify = dd->port->command_table +
-		HW_CMD_TBL_AR_SZ;
-	dd->port->identify_dma = dd->port->command_tbl_dma +
-		HW_CMD_TBL_AR_SZ;
-
-	/* Setup the address of the sector buffer - for some non-ncq cmds */
-	dd->port->sector_buffer = (void *) dd->port->identify + ATA_SECT_SIZE;
-	dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE;
-
-	/* Setup the address of the log buf - for read log command */
-	dd->port->log_buf = (void *)dd->port->sector_buffer + ATA_SECT_SIZE;
-	dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE;
-
-	/* Setup the address of the smart buf - for smart read data command */
-	dd->port->smart_buf = (void *)dd->port->log_buf + ATA_SECT_SIZE;
-	dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE;
-
-
-	/* Point the command headers at the command tables. */
-	for (i = 0; i < num_command_slots; i++) {
-		dd->port->commands[i].command_header =
-					dd->port->command_list +
-					(sizeof(struct mtip_cmd_hdr) * i);
-		dd->port->commands[i].command_header_dma =
-					dd->port->command_list_dma +
-					(sizeof(struct mtip_cmd_hdr) * i);
-
-		dd->port->commands[i].command =
-			dd->port->command_table + (HW_CMD_TBL_SZ * i);
-		dd->port->commands[i].command_dma =
-			dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i);
-
-		if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64)
-			dd->port->commands[i].command_header->ctbau =
-			__force_bit2int cpu_to_le32(
-			(dd->port->commands[i].command_dma >> 16) >> 16);
-		dd->port->commands[i].command_header->ctba =
-		__force_bit2int cpu_to_le32(
-		dd->port->commands[i].command_dma & 0xFFFFFFFF);
-
-		/*
-		 * If this is not done, a bug is reported by the stock
-		 * FC11 i386. Due to the fact that it has lots of kernel
-		 * debugging enabled.
-		 */
-		sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG);
-
-		/* Mark all commands as currently inactive.*/
-		atomic_set(&dd->port->commands[i].active, 0);
-	}
-
 	/* Setup the pointers to the extended s_active and CI registers. */
 	for (i = 0; i < dd->slot_groups; i++) {
@@ -3594,12 +3665,8 @@ out3:
 
 out2:
 	mtip_deinit_port(dd->port);
+	mtip_dma_free(dd);
 
-	/* Free the command/command header memory. */
-	dmam_free_coherent(&dd->pdev->dev,
-			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
-			dd->port->command_list,
-			dd->port->command_list_dma);
 out1:
 	/* Free the memory allocated for the for structure. */
 	kfree(dd->port);
@@ -3622,7 +3689,8 @@ static int mtip_hw_exit(struct driver_data *dd)
 	 * saves its state.
 	 */
 	if (!dd->sr) {
-		if (!test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
+		if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) &&
+		    !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
 			if (mtip_standby_immediate(dd->port))
 				dev_warn(&dd->pdev->dev,
 					"STANDBY IMMEDIATE failed\n");
@@ -3641,11 +3709,9 @@ static int mtip_hw_exit(struct driver_data *dd)
 	irq_set_affinity_hint(dd->pdev->irq, NULL);
 	devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
 
-	/* Free the command/command header memory. */
-	dmam_free_coherent(&dd->pdev->dev,
-			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
-			dd->port->command_list,
-			dd->port->command_list_dma);
+	/* Free dma regions */
+	mtip_dma_free(dd);
 
 	/* Free the memory allocated for the for structure. */
 	kfree(dd->port);
 	dd->port = NULL;
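The point of the mtip32xx restructuring above: one per-command allocation replaces a single huge, high-order coherent allocation that could fail on a fragmented system. With MTIP_MAX_SG raised to 504 (see the header hunk below), the per-command region comes out to exactly two pages; a rough check, assuming the 16-byte struct mtip_cmd_sg layout:

    /* Sketch: per-command DMA sizing after the change. CMD_SG_SZ is an
     * assumption standing in for sizeof(struct mtip_cmd_sg). */
    #include <stdio.h>

    #define MTIP_MAX_SG         504
    #define AHCI_CMD_TBL_HDR_SZ 0x80
    #define CMD_SG_SZ           16

    int main(void)
    {
        unsigned per_cmd = MTIP_MAX_SG * CMD_SG_SZ + AHCI_CMD_TBL_HDR_SZ;

        /* 504 * 16 + 128 = 8192 bytes: a low-order allocation the DMA
         * allocator can satisfy per slot, instead of one giant region. */
        printf("CMD_DMA_ALLOC_SZ = %u bytes\n", per_cmd);
        return 0;
    }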
@@ -69,7 +69,7 @@
  * Maximum number of scatter gather entries
  * a single command may have.
  */
-#define MTIP_MAX_SG		128
+#define MTIP_MAX_SG		504
 
 /*
  * Maximum number of slot groups (Command Issue & s_active registers)
@@ -92,7 +92,7 @@
 
 /* Driver name and version strings */
 #define MTIP_DRV_NAME		"mtip32xx"
-#define MTIP_DRV_VERSION	"1.2.6os3"
+#define MTIP_DRV_VERSION	"1.3.0"
 
 /* Maximum number of minor device numbers per device. */
 #define MTIP_MAX_MINORS		16
@@ -391,15 +391,13 @@ struct mtip_port {
 	 */
 	dma_addr_t rxfis_dma;
 	/*
-	 * Pointer to the beginning of the command table memory as used
-	 * by the driver.
+	 * Pointer to the DMA region for RX Fis, Identify, RLE10, and SMART
 	 */
-	void *command_table;
+	void *block1;
 	/*
-	 * Pointer to the beginning of the command table memory as used
-	 * by the DMA.
+	 * DMA address of region for RX Fis, Identify, RLE10, and SMART
 	 */
-	dma_addr_t command_tbl_dma;
+	dma_addr_t block1_dma;
 	/*
 	 * Pointer to the beginning of the identify data memory as used
 	 * by the driver.
@@ -616,6 +616,11 @@ static int __init null_init(void)
 		irqmode = NULL_IRQ_NONE;
 	}
 #endif
+	if (bs > PAGE_SIZE) {
+		pr_warn("null_blk: invalid block size\n");
+		pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
+		bs = PAGE_SIZE;
+	}
 
 	if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
 		if (submit_queues < nr_online_nodes) {
@@ -581,7 +581,7 @@ static ssize_t pg_write(struct file *filp, const char __user *buf, size_t count,
 
 	if (hdr.magic != PG_MAGIC)
 		return -EINVAL;
-	if (hdr.dlen > PG_MAX_DATA)
+	if (hdr.dlen < 0 || hdr.dlen > PG_MAX_DATA)
 		return -EINVAL;
 	if ((count - hs) > PG_MAX_DATA)
 		return -EINVAL;
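hdr.dlen is a signed int supplied by userspace, so the old check silently let negative values through; the added `hdr.dlen < 0` term closes that hole. A standalone demonstration of the pitfall (PG_MAX_DATA as defined in pg.c):

    /* Sketch: why "dlen > PG_MAX_DATA" alone is not enough for signed input. */
    #include <stdio.h>

    #define PG_MAX_DATA 32768

    int main(void)
    {
        int dlen = -4;  /* hostile or corrupt header from userspace */

        if (dlen > PG_MAX_DATA)
            printf("old check rejects it\n");
        else
            printf("old check passes %d; later size math underflows\n", dlen);

        if (dlen < 0 || dlen > PG_MAX_DATA)
            printf("new check rejects it\n");
        return 0;
    }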
@@ -706,7 +706,9 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 			     WRITE : READ, __GFP_WAIT);
 
 	if (cgc->buflen) {
-		if (blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, __GFP_WAIT))
+		ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
+				      __GFP_WAIT);
+		if (ret)
 			goto out;
 	}
 
@@ -1744,20 +1744,6 @@ static void carm_remove_one (struct pci_dev *pdev)
 	kfree(host);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
-	pci_set_drvdata(pdev, NULL);
 }
 
-static int __init carm_init(void)
-{
-	return pci_register_driver(&carm_driver);
-}
-
-static void __exit carm_exit(void)
-{
-	pci_unregister_driver(&carm_driver);
-}
-
-module_init(carm_init);
-module_exit(carm_exit);
-
+module_pci_driver(carm_driver);
@@ -561,11 +561,11 @@ static int gdrom_set_interrupt_handlers(void)
 	int err;
 
 	err = request_irq(HW_EVENT_GDROM_CMD, gdrom_command_interrupt,
-		IRQF_DISABLED, "gdrom_command", &gd);
+		0, "gdrom_command", &gd);
 	if (err)
 		return err;
 	err = request_irq(HW_EVENT_GDROM_DMA, gdrom_dma_interrupt,
-		IRQF_DISABLED, "gdrom_dma", &gd);
+		0, "gdrom_dma", &gd);
 	if (err)
 		free_irq(HW_EVENT_GDROM_CMD, &gd);
 	return err;
@@ -1,7 +1,8 @@
 
 obj-$(CONFIG_BCACHE)	+= bcache.o
 
-bcache-y		:= alloc.o btree.o bset.o io.o journal.o writeback.o\
-	movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o
+bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
+	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+	util.o writeback.o
 
 CFLAGS_request.o	+= -Iblock
@@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
 {
 	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
 
-	if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
-	    CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
-		return false;
+	if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
+		unsigned i;
+
+		for (i = 0; i < RESERVE_NONE; i++)
+			if (!fifo_full(&ca->free[i]))
+				goto add;
+
+		return false;
+	}
+add:
 	b->prio = 0;
 
 	if (can_inc_bucket_gen(b) &&
@@ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
 	fifo_push(&ca->free_inc, b - ca->buckets);
 }
 
+/*
+ * Determines what order we're going to reuse buckets, smallest bucket_prio()
+ * first: we also take into account the number of sectors of live data in that
+ * bucket, and in order for that multiply to make sense we have to scale bucket
+ *
+ * Thus, we scale the bucket priorities so that the bucket with the smallest
+ * prio is worth 1/8th of what INITIAL_PRIO is worth.
+ */
+
 #define bucket_prio(b)							\
-	(((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))
+({									\
+	unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;	\
+									\
+	(b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b);	\
+})
 
 #define bucket_max_cmp(l, r)	(bucket_prio(l) < bucket_prio(r))
 #define bucket_min_cmp(l, r)	(bucket_prio(l) > bucket_prio(r))
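The effect of the new macro is easiest to see numerically: the min_prio offset guarantees every bucket contributes a nonzero base, so GC_SECTORS_USED still differentiates buckets whose prio equals the set minimum (where the old formula multiplied by zero and made them all tie). A worked check, using INITIAL_PRIO = 32768 as defined in bcache.h and otherwise made-up values:

    /* Sketch: old vs new bucket_prio() for a bucket at the minimum prio. */
    #include <stdio.h>

    #define INITIAL_PRIO 32768U

    int main(void)
    {
        unsigned set_min_prio = 100;  /* ca->set->min_prio */
        unsigned prio = 100;          /* this bucket == the minimum */
        unsigned sectors_used = 7;    /* GC_SECTORS_USED(b) */

        unsigned old = (prio - set_min_prio) * sectors_used;  /* 0: ties */

        unsigned base = (INITIAL_PRIO - set_min_prio) / 8;
        unsigned new = (prio - set_min_prio + base) * sectors_used;

        printf("old = %u, new = %u\n", old, new);  /* old = 0, new > 0 */
        return 0;
    }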
@@ -304,6 +323,21 @@ do {									\
 	__set_current_state(TASK_RUNNING);				\
 } while (0)
 
+static int bch_allocator_push(struct cache *ca, long bucket)
+{
+	unsigned i;
+
+	/* Prios/gens are actually the most important reserve */
+	if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
+		return true;
+
+	for (i = 0; i < RESERVE_NR; i++)
+		if (fifo_push(&ca->free[i], bucket))
+			return true;
+
+	return false;
+}
+
 static int bch_allocator_thread(void *arg)
 {
 	struct cache *ca = arg;
@@ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg)
 			mutex_lock(&ca->set->bucket_lock);
 		}
 
-		allocator_wait(ca, !fifo_full(&ca->free));
-
-		fifo_push(&ca->free, bucket);
+		allocator_wait(ca, bch_allocator_push(ca, bucket));
 		wake_up(&ca->set->bucket_wait);
 	}
 
@@ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg)
 	}
 }
 
-long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
+long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 {
 	DEFINE_WAIT(w);
 	struct bucket *b;
 	long r;
 
 	/* fastpath */
-	if (fifo_used(&ca->free) > ca->watermark[watermark]) {
-		fifo_pop(&ca->free, r);
+	if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
+	    fifo_pop(&ca->free[reserve], r))
 		goto out;
-	}
 
 	if (!wait)
 		return -1;
 
-	while (1) {
-		if (fifo_used(&ca->free) > ca->watermark[watermark]) {
-			fifo_pop(&ca->free, r);
-			break;
-		}
-
+	do {
 		prepare_to_wait(&ca->set->bucket_wait, &w,
 				TASK_UNINTERRUPTIBLE);
 
 		mutex_unlock(&ca->set->bucket_lock);
 		schedule();
 		mutex_lock(&ca->set->bucket_lock);
-	}
+	} while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
+		 !fifo_pop(&ca->free[reserve], r));
 
 	finish_wait(&ca->set->bucket_wait, &w);
 out:
@@ -401,11 +428,13 @@ out:
 	if (expensive_debug_checks(ca->set)) {
 		size_t iter;
 		long i;
+		unsigned j;
 
 		for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
 			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
 
-		fifo_for_each(i, &ca->free, iter)
+		for (j = 0; j < RESERVE_NR; j++)
+			fifo_for_each(i, &ca->free[j], iter)
 				BUG_ON(i == r);
 		fifo_for_each(i, &ca->free_inc, iter)
 			BUG_ON(i == r);
@@ -419,7 +448,7 @@ out:
 
 	SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
 
-	if (watermark <= WATERMARK_METADATA) {
+	if (reserve <= RESERVE_PRIO) {
 		SET_GC_MARK(b, GC_MARK_METADATA);
 		SET_GC_MOVE(b, 0);
 		b->prio = BTREE_PRIO;
@@ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
 	}
 }
 
-int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
+int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
 			   struct bkey *k, int n, bool wait)
 {
 	int i;
@@ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
 
 	for (i = 0; i < n; i++) {
 		struct cache *ca = c->cache_by_alloc[i];
-		long b = bch_bucket_alloc(ca, watermark, wait);
+		long b = bch_bucket_alloc(ca, reserve, wait);
 
 		if (b == -1)
 			goto err;
@@ -478,12 +507,12 @@ err:
 	return -1;
 }
 
-int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
+int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
 			 struct bkey *k, int n, bool wait)
 {
 	int ret;
 	mutex_lock(&c->bucket_lock);
-	ret = __bch_bucket_alloc_set(c, watermark, k, n, wait);
+	ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
 	mutex_unlock(&c->bucket_lock);
 	return ret;
 }
@@ -573,8 +602,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
 
 	while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
 		unsigned watermark = write_prio
-			? WATERMARK_MOVINGGC
-			: WATERMARK_NONE;
+			? RESERVE_MOVINGGC
+			: RESERVE_NONE;
 
 		spin_unlock(&c->data_bucket_lock);
 
@@ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca)
 	 * Then 8 for btree allocations
 	 * Then half for the moving garbage collector
 	 */
-
+#if 0
 	ca->watermark[WATERMARK_PRIO] = 0;
 
 	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
@@ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca)
 
 	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
 		ca->watermark[WATERMARK_MOVINGGC];
-
+#endif
 	return 0;
 }
@@ -187,6 +187,7 @@
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
+#include "bset.h"
 #include "util.h"
 #include "closure.h"
 
@@ -309,7 +310,8 @@ struct cached_dev {
 	struct cache_sb		sb;
 	struct bio		sb_bio;
 	struct bio_vec		sb_bv[1];
-	struct closure_with_waitlist sb_write;
+	struct closure		sb_write;
+	struct semaphore	sb_write_mutex;
 
 	/* Refcount on the cache set. Always nonzero when we're caching. */
 	atomic_t		count;
@@ -382,12 +384,12 @@ struct cached_dev {
 	unsigned		writeback_rate_p_term_inverse;
 };
 
-enum alloc_watermarks {
-	WATERMARK_PRIO,
-	WATERMARK_METADATA,
-	WATERMARK_MOVINGGC,
-	WATERMARK_NONE,
-	WATERMARK_MAX
+enum alloc_reserve {
+	RESERVE_BTREE,
+	RESERVE_PRIO,
+	RESERVE_MOVINGGC,
+	RESERVE_NONE,
+	RESERVE_NR,
 };
 
 struct cache {
@@ -399,8 +401,6 @@ struct cache {
 	struct kobject		kobj;
 	struct block_device	*bdev;
 
-	unsigned		watermark[WATERMARK_MAX];
-
 	struct task_struct	*alloc_thread;
 
 	struct closure		prio;
@@ -429,7 +429,7 @@ struct cache {
 	 * because all the data they contained was overwritten), so we only
 	 * need to discard them before they can be moved to the free list.
 	 */
-	DECLARE_FIFO(long, free);
+	DECLARE_FIFO(long, free)[RESERVE_NR];
 	DECLARE_FIFO(long, free_inc);
 	DECLARE_FIFO(long, unused);
 
@@ -514,7 +514,8 @@ struct cache_set {
 	uint64_t		cached_dev_sectors;
 	struct closure		caching;
 
-	struct closure_with_waitlist sb_write;
+	struct closure		sb_write;
+	struct semaphore	sb_write_mutex;
 
 	mempool_t		*search;
 	mempool_t		*bio_meta;
@@ -629,13 +630,15 @@ struct cache_set {
 
 #ifdef CONFIG_BCACHE_DEBUG
 	struct btree		*verify_data;
+	struct bset		*verify_ondisk;
 	struct mutex		verify_lock;
 #endif
 
 	unsigned		nr_uuids;
 	struct uuid_entry	*uuids;
 	BKEY_PADDED(uuid_bucket);
-	struct closure_with_waitlist uuid_write;
+	struct closure		uuid_write;
+	struct semaphore	uuid_write_mutex;
 
 	/*
 	 * A btree node on disk could have too many bsets for an iterator to fit
@@ -643,13 +646,7 @@ struct cache_set {
 	 */
 	mempool_t		*fill_iter;
 
-	/*
-	 * btree_sort() is a merge sort and requires temporary space - single
-	 * element mempool
-	 */
-	struct mutex		sort_lock;
-	struct bset		*sort;
-	unsigned		sort_crit_factor;
+	struct bset_sort_state	sort;
 
 	/* List of buckets we're currently writing data to */
 	struct list_head	data_buckets;
@@ -665,7 +662,6 @@ struct cache_set {
 	unsigned		congested_read_threshold_us;
 	unsigned		congested_write_threshold_us;
 
-	struct time_stats	sort_time;
 	struct time_stats	btree_gc_time;
 	struct time_stats	btree_split_time;
 	struct time_stats	btree_read_time;
@@ -683,9 +679,9 @@ struct cache_set {
 	unsigned		error_decay;
 
 	unsigned short		journal_delay_ms;
-	bool			expensive_debug_checks;
 	unsigned		verify:1;
 	unsigned		key_merging_disabled:1;
+	unsigned		expensive_debug_checks:1;
 	unsigned		gc_always_rewrite:1;
 	unsigned		shrinker_disabled:1;
 	unsigned		copy_gc_enabled:1;
@@ -707,13 +703,8 @@ struct bbio {
 	struct bio		bio;
 };
 
-static inline unsigned local_clock_us(void)
-{
-	return local_clock() >> 10;
-}
-
 #define BTREE_PRIO		USHRT_MAX
-#define INITIAL_PRIO		32768
+#define INITIAL_PRIO		32768U
 
 #define btree_bytes(c)		((c)->btree_pages * PAGE_SIZE)
 #define btree_blocks(b)						\
@@ -726,21 +717,6 @@ static inline unsigned local_clock_us(void)
 #define bucket_bytes(c)		((c)->sb.bucket_size << 9)
 #define block_bytes(c)		((c)->sb.block_size << 9)
 
-#define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t))
-#define set_bytes(i)		__set_bytes(i, i->keys)
-
-#define __set_blocks(i, k, c)	DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
-#define set_blocks(i, c)	__set_blocks(i, (i)->keys, c)
-
-#define node(i, j)		((struct bkey *) ((i)->d + (j)))
-#define end(i)			node(i, (i)->keys)
-
-#define index(i, b)						\
-	((size_t) (((void *) i - (void *) (b)->sets[0].data) /	\
-	 block_bytes(b->c)))
-
-#define btree_data_space(b)	(PAGE_SIZE << (b)->page_order)
-
 #define prios_per_bucket(c)				\
 	((bucket_bytes(c) - sizeof(struct prio_set)) /	\
 	 sizeof(struct bucket_disk))
@@ -783,20 +759,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
 	return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
 }
 
-/* Btree key macros */
-
-static inline void bkey_init(struct bkey *k)
+static inline uint8_t gen_after(uint8_t a, uint8_t b)
 {
-	*k = ZERO_KEY;
+	uint8_t r = a - b;
+	return r > 128U ? 0 : r;
 }
 
+static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
+				unsigned i)
+{
+	return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
+}
+
+static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
+				 unsigned i)
+{
+	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
+}
+
+/* Btree key macros */
+
 /*
  * This is used for various on disk data structures - cache_sb, prio_set, bset,
  * jset: The checksum is _always_ the first 8 bytes of these structs
  */
 #define csum_set(i)							\
 	bch_crc64(((void *) (i)) + sizeof(uint64_t),			\
-		  ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t)))
+		  ((void *) bset_bkey_last(i)) -			\
+		  (((void *) (i)) + sizeof(uint64_t)))
 
 /* Error handling macros */
 
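gen_after(), now an inline in bcache.h, is a wraparound-safe comparison of 8-bit generation counters: the subtraction happens mod 256, and anything that appears more than 128 steps "ahead" is treated as not ahead at all. For example:

    /* Sketch: wraparound-safe generation comparison, as in gen_after(). */
    #include <stdint.h>
    #include <stdio.h>

    static uint8_t gen_after(uint8_t a, uint8_t b)
    {
        uint8_t r = a - b;      /* mod-256 distance from b to a */
        return r > 128U ? 0 : r;
    }

    int main(void)
    {
        /* 2 is "after" 250: the counter wrapped; the distance is 8. */
        printf("gen_after(2, 250) = %u\n", gen_after(2, 250));
        /* 250 is not after 2: distance 248 > 128, so it reads as 0. */
        printf("gen_after(250, 2) = %u\n", gen_after(250, 2));
        return 0;
    }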
[file diff suppressed because it is too large]
@ -1,7 +1,11 @@
|
||||
#ifndef _BCACHE_BSET_H
|
||||
#define _BCACHE_BSET_H
|
||||
|
||||
#include <linux/slab.h>
|
||||
#include <linux/bcache.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#include "util.h" /* for time_stats */
|
||||
|
||||
/*
|
||||
* BKEYS:
|
||||
@ -142,20 +146,13 @@
|
||||
* first key in that range of bytes again.
|
||||
*/
|
||||
|
||||
/* Btree key comparison/iteration */
|
||||
struct btree_keys;
|
||||
struct btree_iter;
|
||||
struct btree_iter_set;
|
||||
struct bkey_float;
|
||||
|
||||
#define MAX_BSETS 4U
|
||||
|
||||
struct btree_iter {
|
||||
size_t size, used;
|
||||
#ifdef CONFIG_BCACHE_DEBUG
|
||||
struct btree *b;
|
||||
#endif
|
||||
struct btree_iter_set {
|
||||
struct bkey *k, *end;
|
||||
} data[MAX_BSETS];
|
||||
};
|
||||
|
||||
struct bset_tree {
|
||||
/*
|
||||
* We construct a binary tree in an array as if the array
|
||||
@@ -188,6 +185,213 @@ struct bset_tree {
 	struct bset	*data;
 };
 
+struct btree_keys_ops {
+	bool		(*sort_cmp)(struct btree_iter_set,
+				    struct btree_iter_set);
+	struct bkey	*(*sort_fixup)(struct btree_iter *, struct bkey *);
+	bool		(*insert_fixup)(struct btree_keys *, struct bkey *,
+					struct btree_iter *, struct bkey *);
+	bool		(*key_invalid)(struct btree_keys *,
+				       const struct bkey *);
+	bool		(*key_bad)(struct btree_keys *, const struct bkey *);
+	bool		(*key_merge)(struct btree_keys *,
+				     struct bkey *, struct bkey *);
+	void		(*key_to_text)(char *, size_t, const struct bkey *);
+	void		(*key_dump)(struct btree_keys *, const struct bkey *);
+
+	/*
+	 * Only used for deciding whether to use START_KEY(k) or just the key
+	 * itself in a couple places
+	 */
+	bool		is_extents;
+};
+
+struct btree_keys {
+	const struct btree_keys_ops	*ops;
+	uint8_t			page_order;
+	uint8_t			nsets;
+	unsigned		last_set_unwritten:1;
+	bool			*expensive_debug_checks;
+
+	/*
+	 * Sets of sorted keys - the real btree node - plus a binary search tree
+	 *
+	 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+	 * to the memory we have allocated for this btree node. Additionally,
+	 * set[0]->data points to the entire btree node as it exists on disk.
+	 */
+	struct bset_tree	set[MAX_BSETS];
+};
+
+static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
+{
+	return b->set + b->nsets;
+}
+
+static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
+{
+	return t <= b->set + b->nsets - b->last_set_unwritten;
+}
+
+static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
+{
+	return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
+}
+
+static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
+{
+	return ((size_t) i) - ((size_t) b->set->data);
+}
+
+static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
+{
+	return bset_byte_offset(b, i) >> 9;
+}
+
+#define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t))
+#define set_bytes(i)		__set_bytes(i, i->keys)
+
+#define __set_blocks(i, k, block_bytes)				\
+	DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
+#define set_blocks(i, block_bytes)				\
+	__set_blocks(i, (i)->keys, block_bytes)
+
+static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
+{
+	struct bset_tree *t = bset_tree_last(b);
+
+	BUG_ON((PAGE_SIZE << b->page_order) <
+	       (bset_byte_offset(b, t->data) + set_bytes(t->data)));
+
+	if (!b->last_set_unwritten)
+		return 0;
+
+	return ((PAGE_SIZE << b->page_order) -
+		(bset_byte_offset(b, t->data) + set_bytes(t->data))) /
+		sizeof(u64);
+}
+
+static inline struct bset *bset_next_set(struct btree_keys *b,
+					 unsigned block_bytes)
+{
+	struct bset *i = bset_tree_last(b)->data;
+
+	return ((void *) i) + roundup(set_bytes(i), block_bytes);
+}
+
+void bch_btree_keys_free(struct btree_keys *);
+int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
+void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
+			 bool *);
+
+void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t);
+void bch_bset_build_written_tree(struct btree_keys *);
+void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
+bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
+void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
+unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
+			      struct bkey *);
+
+enum {
+	BTREE_INSERT_STATUS_NO_INSERT = 0,
+	BTREE_INSERT_STATUS_INSERT,
+	BTREE_INSERT_STATUS_BACK_MERGE,
+	BTREE_INSERT_STATUS_OVERWROTE,
+	BTREE_INSERT_STATUS_FRONT_MERGE,
+};
+
+/* Btree key iteration */
+
+struct btree_iter {
+	size_t size, used;
+#ifdef CONFIG_BCACHE_DEBUG
+	struct btree_keys *b;
+#endif
+	struct btree_iter_set {
+		struct bkey *k, *end;
+	} data[MAX_BSETS];
+};
+
+typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *);
+
+struct bkey *bch_btree_iter_next(struct btree_iter *);
+struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
+					struct btree_keys *, ptr_filter_fn);
+
+void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
+struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *,
+				 struct bkey *);
+
+struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *,
+			       const struct bkey *);
+
+/*
+ * Returns the first key that is strictly greater than search
+ */
+static inline struct bkey *bch_bset_search(struct btree_keys *b,
+					   struct bset_tree *t,
+					   const struct bkey *search)
+{
+	return search ? __bch_bset_search(b, t, search) : t->data->start;
+}
+
+#define for_each_key_filter(b, k, iter, filter)				\
+	for (bch_btree_iter_init((b), (iter), NULL);			\
+	     ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
+
+#define for_each_key(b, k, iter)					\
+	for (bch_btree_iter_init((b), (iter), NULL);			\
+	     ((k) = bch_btree_iter_next(iter));)
+
+/* Sorting */
+
+struct bset_sort_state {
+	mempool_t		*pool;
+
+	unsigned		page_order;
+	unsigned		crit_factor;
+
+	struct time_stats	time;
+};
+
+void bch_bset_sort_state_free(struct bset_sort_state *);
+int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
+void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
+void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
+			 struct bset_sort_state *);
+void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
+				    struct bset_sort_state *);
+void bch_btree_sort_partial(struct btree_keys *, unsigned,
+			    struct bset_sort_state *);
+
+static inline void bch_btree_sort(struct btree_keys *b,
+				  struct bset_sort_state *state)
+{
+	bch_btree_sort_partial(b, 0, state);
+}
+
+struct bset_stats {
+	size_t sets_written, sets_unwritten;
+	size_t bytes_written, bytes_unwritten;
+	size_t floats, failed;
+};
+
+void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
+
+/* Bkey utility code */
+
+#define bset_bkey_last(i)	bkey_idx((struct bkey *) (i)->d, (i)->keys)
+
+static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
+{
+	return bkey_idx(i->start, idx);
+}
+
+static inline void bkey_init(struct bkey *k)
+{
+	*k = ZERO_KEY;
+}
+
 static __always_inline int64_t bkey_cmp(const struct bkey *l,
 					const struct bkey *r)
 {
@@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
 		: (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
 }
 
+void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
+			      unsigned);
+bool __bch_cut_front(const struct bkey *, struct bkey *);
+bool __bch_cut_back(const struct bkey *, struct bkey *);
+
+static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
+{
+	BUG_ON(bkey_cmp(where, k) > 0);
+	return __bch_cut_front(where, k);
+}
+
+static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
+{
+	BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
+	return __bch_cut_back(where, k);
+}
+
+#define PRECEDING_KEY(_k)					\
+({								\
+	struct bkey *_ret = NULL;				\
+								\
+	if (KEY_INODE(_k) || KEY_OFFSET(_k)) {			\
+		_ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0);	\
+								\
+		if (!_ret->low)					\
+			_ret->high--;				\
+		_ret->low--;					\
+	}							\
+								\
+	_ret;							\
+})
+
+static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
+{
+	return b->ops->key_invalid(b, k);
+}
+
+static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
+{
+	return b->ops->key_bad(b, k);
+}
+
+static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
+				    size_t size, const struct bkey *k)
+{
+	return b->ops->key_to_text(buf, size, k);
+}
+
+static inline bool bch_bkey_equal_header(const struct bkey *l,
+					 const struct bkey *r)
+{
+	return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
+		KEY_PTRS(l) == KEY_PTRS(r) &&
+		KEY_CSUM(l) == KEY_CSUM(l));
+}
+
 /* Keylists */
 
 struct keylist {
@@ -257,136 +517,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l)
 
 struct bkey *bch_keylist_pop(struct keylist *);
 void bch_keylist_pop_front(struct keylist *);
-int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
+int __bch_keylist_realloc(struct keylist *, unsigned);
 
-void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
-			      unsigned);
-bool __bch_cut_front(const struct bkey *, struct bkey *);
-bool __bch_cut_back(const struct bkey *, struct bkey *);
+/* Debug stuff */
 
-static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
-{
-	BUG_ON(bkey_cmp(where, k) > 0);
-	return __bch_cut_front(where, k);
-}
+#ifdef CONFIG_BCACHE_DEBUG
 
-static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
-{
-	BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
-	return __bch_cut_back(where, k);
-}
+int __bch_count_data(struct btree_keys *);
+void __bch_check_keys(struct btree_keys *, const char *, ...);
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
+void bch_dump_bucket(struct btree_keys *);
 
-const char *bch_ptr_status(struct cache_set *, const struct bkey *);
-bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
-bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
+#else
 
-bool bch_ptr_bad(struct btree *, const struct bkey *);
-
-static inline uint8_t gen_after(uint8_t a, uint8_t b)
-{
-	uint8_t r = a - b;
-	return r > 128U ? 0 : r;
-}
-
-static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
-				unsigned i)
-{
-	return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
-}
-
-static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
-				 unsigned i)
-{
-	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
-}
-
-
-typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
-
-struct bkey *bch_btree_iter_next(struct btree_iter *);
-struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
-					struct btree *, ptr_filter_fn);
-
-void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
-struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
-				   struct bkey *, struct bset_tree *);
-
-/* 32 bits total: */
-#define BKEY_MID_BITS		3
-#define BKEY_EXPONENT_BITS	7
-#define BKEY_MANTISSA_BITS	22
-#define BKEY_MANTISSA_MASK	((1 << BKEY_MANTISSA_BITS) - 1)
-
-struct bkey_float {
-	unsigned	exponent:BKEY_EXPONENT_BITS;
-	unsigned	m:BKEY_MID_BITS;
-	unsigned	mantissa:BKEY_MANTISSA_BITS;
-} __packed;
-
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It definites the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliar search tree - when we're done searching the bset_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE		128
-#define bset_tree_space(b)	(btree_data_space(b) / BSET_CACHELINE)
-
-#define bset_tree_bytes(b)	(bset_tree_space(b) * sizeof(struct bkey_float))
-#define bset_prev_bytes(b)	(bset_tree_space(b) * sizeof(uint8_t))
-
-void bch_bset_init_next(struct btree *);
-
-void bch_bset_fix_invalidated_key(struct btree *, struct bkey *);
-void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
-
-struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
-			       const struct bkey *);
-
-/*
- * Returns the first key that is strictly greater than search
- */
-static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
-					   const struct bkey *search)
-{
-	return search ? __bch_bset_search(b, t, search) : t->data->start;
-}
-
-#define PRECEDING_KEY(_k)					\
-({								\
-	struct bkey *_ret = NULL;				\
-								\
-	if (KEY_INODE(_k) || KEY_OFFSET(_k)) {			\
-		_ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0);	\
-								\
-		if (!_ret->low)					\
-			_ret->high--;				\
-		_ret->low--;					\
-	}							\
-								\
-	_ret;							\
-})
-
-bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
-void bch_btree_sort_lazy(struct btree *);
-void bch_btree_sort_into(struct btree *, struct btree *);
-void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
-void bch_btree_sort_partial(struct btree *, unsigned);
-
-static inline void bch_btree_sort(struct btree *b)
-{
-	bch_btree_sort_partial(b, 0);
-}
-
-int bch_bset_print_stats(struct cache_set *, char *);
+static inline int __bch_count_data(struct btree_keys *b) { return -1; }
+static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
+static inline void bch_dump_bucket(struct btree_keys *b) {}
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
+
+#endif
+
+static inline bool btree_keys_expensive_checks(struct btree_keys *b)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+	return *b->expensive_debug_checks;
+#else
+	return false;
+#endif
+}
+
+static inline int bch_count_data(struct btree_keys *b)
+{
+	return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
+}
+
+#define bch_check_keys(b, ...)					\
+do {								\
+	if (btree_keys_expensive_checks(b))			\
+		__bch_check_keys(b, __VA_ARGS__);		\
+} while (0)
 
 #endif
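A quick note on the __set_bytes()/__set_blocks() helpers moved into bset.h above (now taking block_bytes directly instead of a cache_set): a bset is its fixed header plus `keys` 64-bit words, rounded up to whole blocks. A hedged worked example, with a stand-in struct bset and an arbitrary 512-byte block size (the real layout lives in linux/bcache.h):

    /* Sketch: the set_bytes/set_blocks arithmetic; header is a stand-in. */
    #include <stdint.h>
    #include <stdio.h>

    struct bset { uint64_t csum, magic, seq; uint32_t version, keys; uint64_t d[]; };

    #define __set_bytes(i, k)  (sizeof(*(i)) + (k) * sizeof(uint64_t))
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
    #define __set_blocks(i, k, block_bytes) \
            DIV_ROUND_UP(__set_bytes(i, k), block_bytes)

    int main(void)
    {
        struct bset *i = 0;     /* used only for sizeof; never dereferenced */
        unsigned block_bytes = 512;

        printf("100 keys -> %zu bytes -> %zu blocks of %u\n",
               __set_bytes(i, 100), __set_blocks(i, 100, block_bytes),
               block_bytes);
        return 0;
    }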
[file diff suppressed because it is too large]
@@ -130,20 +130,12 @@ struct btree {
 	unsigned long		flags;
 	uint16_t		written;	/* would be nice to kill */
 	uint8_t			level;
-	uint8_t			nsets;
-	uint8_t			page_order;
 
-	/*
-	 * Set of sorted keys - the real btree node - plus a binary search tree
-	 *
-	 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
-	 * to the memory we have allocated for this btree node. Additionally,
-	 * set[0]->data points to the entire btree node as it exists on disk.
-	 */
-	struct bset_tree	sets[MAX_BSETS];
+	struct btree_keys	keys;
 
 	/* For outstanding btree writes, used as a lock - protects write_idx */
-	struct closure_with_waitlist	io;
+	struct closure		io;
+	struct semaphore	io_mutex;
 
 	struct list_head	list;
 	struct delayed_work	work;
@@ -179,24 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b)
 	return b->writes + (btree_node_write_idx(b) ^ 1);
 }
 
-static inline unsigned bset_offset(struct btree *b, struct bset *i)
+static inline struct bset *btree_bset_first(struct btree *b)
 {
-	return (((size_t) i) - ((size_t) b->sets->data)) >> 9;
+	return b->keys.set->data;
 }
 
-static inline struct bset *write_block(struct btree *b)
+static inline struct bset *btree_bset_last(struct btree *b)
 {
-	return ((void *) b->sets[0].data) + b->written * block_bytes(b->c);
+	return bset_tree_last(&b->keys)->data;
 }
 
-static inline bool bset_written(struct btree *b, struct bset_tree *t)
+static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
 {
-	return t->data < write_block(b);
-}
-
-static inline bool bkey_written(struct btree *b, struct bkey *k)
-{
-	return k < write_block(b)->start;
+	return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
 }
 
 static inline void set_gc_sectors(struct cache_set *c)
|
||||
atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
|
||||
}
|
||||
|
||||
static inline struct bkey *bch_btree_iter_init(struct btree *b,
|
||||
struct btree_iter *iter,
|
||||
struct bkey *search)
|
||||
{
|
||||
return __bch_btree_iter_init(b, iter, search, b->sets);
|
||||
}
|
||||
|
||||
static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
|
||||
{
|
||||
if (b->level)
|
||||
return bch_btree_ptr_invalid(b->c, k);
|
||||
else
|
||||
return bch_extent_ptr_invalid(b->c, k);
|
||||
}
|
||||
|
||||
void bkey_put(struct cache_set *c, struct bkey *k);
|
||||
|
||||
/* Looping macros */
|
||||
@ -229,17 +201,12 @@ void bkey_put(struct cache_set *c, struct bkey *k);
|
||||
iter++) \
|
||||
hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
|
||||
|
||||
#define for_each_key_filter(b, k, iter, filter) \
|
||||
for (bch_btree_iter_init((b), (iter), NULL); \
|
||||
((k) = bch_btree_iter_next_filter((iter), b, filter));)
|
||||
|
||||
#define for_each_key(b, k, iter) \
|
||||
for (bch_btree_iter_init((b), (iter), NULL); \
|
||||
((k) = bch_btree_iter_next(iter));)
|
||||
|
||||
/* Recursing down the btree */
|
||||
|
||||
struct btree_op {
|
||||
/* for waiting on btree reserve in btree_split() */
|
||||
wait_queue_t wait;
|
||||
|
||||
/* Btree level at which we start taking write locks */
|
||||
short lock;
|
||||
|
||||
@ -249,6 +216,7 @@ struct btree_op {
|
||||
static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
|
||||
{
|
||||
memset(op, 0, sizeof(struct btree_op));
|
||||
init_wait(&op->wait);
|
||||
op->lock = write_lock_level;
|
||||
}
|
||||
|
||||
@ -267,7 +235,7 @@ static inline void rw_unlock(bool w, struct btree *b)
|
||||
(w ? up_write : up_read)(&b->lock);
|
||||
}
|
||||
|
||||
void bch_btree_node_read(struct btree *);
|
||||
void bch_btree_node_read_done(struct btree *);
|
||||
void bch_btree_node_write(struct btree *, struct closure *);
|
||||
|
||||
void bch_btree_set_root(struct btree *);
|
||||
|
@@ -11,19 +11,6 @@
 
 #include "closure.h"
 
-#define CL_FIELD(type, field)					\
-	case TYPE_ ## type:					\
-	return &container_of(cl, struct type, cl)->field
-
-static struct closure_waitlist *closure_waitlist(struct closure *cl)
-{
-	switch (cl->type) {
-	CL_FIELD(closure_with_waitlist, wait);
-	default:
-		return NULL;
-	}
-}
-
 static inline void closure_put_after_sub(struct closure *cl, int flags)
 {
 	int r = flags & CLOSURE_REMAINING_MASK;
@@ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
 			closure_queue(cl);
 		} else {
 			struct closure *parent = cl->parent;
-			struct closure_waitlist *wait = closure_waitlist(cl);
 			closure_fn *destructor = cl->fn;
 
 			closure_debug_destroy(cl);
 
-			smp_mb();
-			atomic_set(&cl->remaining, -1);
-
-			if (wait)
-				closure_wake_up(wait);
-
 			if (destructor)
 				destructor(cl);
 
@@ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v)
 }
 EXPORT_SYMBOL(closure_sub);
 
+/**
+ * closure_put - decrement a closure's refcount
+ */
 void closure_put(struct closure *cl)
 {
 	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
 }
 EXPORT_SYMBOL(closure_put);
 
-static void set_waiting(struct closure *cl, unsigned long f)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-	cl->waiting_on = f;
-#endif
-}
-
+/**
+ * closure_wake_up - wake up all closures on a wait list, without memory barrier
+ */
 void __closure_wake_up(struct closure_waitlist *wait_list)
 {
 	struct llist_node *list;
@@ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
 		cl = container_of(reverse, struct closure, list);
 		reverse = llist_next(reverse);
 
-		set_waiting(cl, 0);
+		closure_set_waiting(cl, 0);
 		closure_sub(cl, CLOSURE_WAITING + 1);
 	}
 }
 EXPORT_SYMBOL(__closure_wake_up);
 
-bool closure_wait(struct closure_waitlist *list, struct closure *cl)
+/**
+ * closure_wait - add a closure to a waitlist
+ *
+ * @waitlist will own a ref on @cl, which will be released when
+ * closure_wake_up() is called on @waitlist.
+ *
+ */
+bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
 {
 	if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
 		return false;
 
-	set_waiting(cl, _RET_IP_);
+	closure_set_waiting(cl, _RET_IP_);
 	atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
-	llist_add(&cl->list, &list->list);
+	llist_add(&cl->list, &waitlist->list);
 
 	return true;
 }
 EXPORT_SYMBOL(closure_wait);
 
 /**
- * closure_sync() - sleep until a closure a closure has nothing left to wait on
+ * closure_sync - sleep until a closure a closure has nothing left to wait on
  *
  * Sleeps until the refcount hits 1 - the thread that's running the closure owns
  * the last refcount.
@@ -148,46 +134,6 @@ void closure_sync(struct closure *cl)
 }
 EXPORT_SYMBOL(closure_sync);
 
-/**
- * closure_trylock() - try to acquire the closure, without waiting
- * @cl: closure to lock
- *
- * Returns true if the closure was succesfully locked.
- */
-bool closure_trylock(struct closure *cl, struct closure *parent)
-{
-	if (atomic_cmpxchg(&cl->remaining, -1,
-			   CLOSURE_REMAINING_INITIALIZER) != -1)
-		return false;
-
-	smp_mb();
-
-	cl->parent = parent;
-	if (parent)
-		closure_get(parent);
-
-	closure_set_ret_ip(cl);
-	closure_debug_create(cl);
-	return true;
-}
-EXPORT_SYMBOL(closure_trylock);
-
-void __closure_lock(struct closure *cl, struct closure *parent,
-		    struct closure_waitlist *wait_list)
-{
-	struct closure wait;
-	closure_init_stack(&wait);
-
-	while (1) {
-		if (closure_trylock(cl, parent))
-			return;
-
-		closure_wait_event(wait_list, &wait,
-				   atomic_read(&cl->remaining) == -1);
-	}
-}
-EXPORT_SYMBOL(__closure_lock);
|
||||
|
||||
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
|
||||
|
||||
static LIST_HEAD(closure_list);
|
||||
|
@ -72,30 +72,6 @@
|
||||
* closure - _always_ use continue_at(). Doing so consistently will help
|
||||
* eliminate an entire class of particularly pernicious races.
|
||||
*
|
||||
* For a closure to wait on an arbitrary event, we need to introduce waitlists:
|
||||
*
|
||||
* struct closure_waitlist list;
|
||||
* closure_wait_event(list, cl, condition);
|
||||
* closure_wake_up(wait_list);
|
||||
*
|
||||
* These work analagously to wait_event() and wake_up() - except that instead of
|
||||
* operating on the current thread (for wait_event()) and lists of threads, they
|
||||
* operate on an explicit closure and lists of closures.
|
||||
*
|
||||
* Because it's a closure we can now wait either synchronously or
|
||||
* asynchronously. closure_wait_event() returns the current value of the
|
||||
* condition, and if it returned false continue_at() or closure_sync() can be
|
||||
* used to wait for it to become true.
|
||||
*
|
||||
* It's useful for waiting on things when you can't sleep in the context in
|
||||
* which you must check the condition (perhaps a spinlock held, or you might be
|
||||
* beneath generic_make_request() - in which case you can't sleep on IO).
|
||||
*
|
||||
* closure_wait_event() will wait either synchronously or asynchronously,
|
||||
* depending on whether the closure is in blocking mode or not. You can pick a
|
||||
* mode explicitly with closure_wait_event_sync() and
|
||||
* closure_wait_event_async(), which do just what you might expect.
|
||||
*
|
||||
* Lastly, you might have a wait list dedicated to a specific event, and have no
|
||||
* need for specifying the condition - you just want to wait until someone runs
|
||||
* closure_wake_up() on the appropriate wait list. In that case, just use
|
||||
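For illustration, a minimal sketch of the waitlist pattern this (pre-patch) comment describes; the device structure, its fields, and the condition are invented for the example, while closure_wait_event() and closure_wake_up() are the calls documented above:

struct my_dev {
	struct closure_waitlist	buffers_free;
	atomic_t		nr_free;
};

static void my_dev_get_buffer(struct my_dev *d, struct closure *cl)
{
	/* Waits - synchronously or asynchronously, depending on the
	 * closure's mode - until a buffer becomes free: */
	closure_wait_event(&d->buffers_free, cl,
			   atomic_read(&d->nr_free) > 0);
}

static void my_dev_put_buffer(struct my_dev *d)
{
	atomic_inc(&d->nr_free);
	/* Drops the ref each waiter took on its closure in closure_wait(): */
	closure_wake_up(&d->buffers_free);
}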
@ -121,40 +97,6 @@
 * All this implies that a closure should typically be embedded in a particular
 * struct (which its refcount will normally control the lifetime of), and that
 * struct can very much be thought of as a stack frame.
 *
 * Locking:
 *
 * Closures are based on work items but they can be thought of as more like
 * threads - in that like threads and unlike work items they have a well
 * defined lifetime; they are created (with closure_init()) and eventually
 * complete after a continue_at(cl, NULL, NULL).
 *
 * Suppose you've got some larger structure with a closure embedded in it that's
 * used for periodically doing garbage collection. You only want one garbage
 * collection happening at a time, so the natural thing to do is protect it with
 * a lock. However, it's difficult to use a lock protecting a closure correctly
 * because the unlock should come after the last continue_at() (additionally, if
 * you're using the closure asynchronously a mutex won't work since a mutex has
 * to be unlocked by the same process that locked it).
 *
 * So to make it less error prone and more efficient, we also have the ability
 * to use closures as locks:
 *
 * closure_init_unlocked();
 * closure_trylock();
 *
 * That's all we need for trylock() - the last closure_put() implicitly unlocks
 * it for you. But for closure_lock(), we also need a wait list:
 *
 * struct closure_with_waitlist frobnicator_cl;
 *
 * closure_init_unlocked(&frobnicator_cl);
 * closure_lock(&frobnicator_cl);
 *
 * A closure_with_waitlist embeds a closure and a wait list - much like struct
 * delayed_work embeds a work item and a timer_list. The important thing is, use
 * it exactly like you would a regular closure and closure_put() will magically
 * handle everything for you.
 */
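To make the locking recipe concrete, here is a minimal sketch of a single-instance garbage-collection job guarded by a closure used as a lock. gc_cl, gc_work() and gc_kick() are hypothetical names; the calls are the pre-patch API this comment documents (which this very commit removes), including closure_trylock_call() from further down in this header:

static struct closure_with_waitlist gc_cl;

static void gc_work(struct closure *cl)
{
	/* ... one garbage collection pass ... */

	/* The final put implicitly unlocks gc_cl: */
	closure_return(cl);
}

static void gc_init(void)
{
	closure_init_unlocked(&gc_cl);
}

static void gc_kick(void)
{
	/* Runs gc_work() out of system_wq only if no pass is already
	 * running; otherwise does nothing: */
	closure_trylock_call(&gc_cl.cl, gc_work, system_wq, NULL);
}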

struct closure;
@ -164,12 +106,6 @@ struct closure_waitlist {
	struct llist_head list;
};

enum closure_type {
	TYPE_closure = 0,
	TYPE_closure_with_waitlist = 1,
	MAX_CLOSURE_TYPE = 1,
};

enum closure_state {
	/*
	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
@ -224,8 +160,6 @@ struct closure {

	atomic_t remaining;

	enum closure_type type;

#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
#define CLOSURE_MAGIC_DEAD 0xc054dead
#define CLOSURE_MAGIC_ALIVE 0xc054a11e
@ -237,34 +171,12 @@ struct closure {
#endif
};

struct closure_with_waitlist {
	struct closure cl;
	struct closure_waitlist wait;
};

extern unsigned invalid_closure_type(void);

#define __CLOSURE_TYPE(cl, _t)						\
	__builtin_types_compatible_p(typeof(cl), struct _t)		\
		? TYPE_ ## _t :						\

#define __closure_type(cl)						\
(									\
	__CLOSURE_TYPE(cl, closure)					\
	__CLOSURE_TYPE(cl, closure_with_waitlist)			\
	invalid_closure_type()						\
)

void closure_sub(struct closure *cl, int v);
void closure_put(struct closure *cl);
void __closure_wake_up(struct closure_waitlist *list);
bool closure_wait(struct closure_waitlist *list, struct closure *cl);
void closure_sync(struct closure *cl);

bool closure_trylock(struct closure *cl, struct closure *parent);
void __closure_lock(struct closure *cl, struct closure *parent,
		    struct closure_waitlist *wait_list);

#ifdef CONFIG_BCACHE_CLOSURES_DEBUG

void closure_debug_init(void);
@ -293,114 +205,13 @@ static inline void closure_set_ret_ip(struct closure *cl)
#endif
}

static inline void closure_get(struct closure *cl)
static inline void closure_set_waiting(struct closure *cl, unsigned long f)
{
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
	BUG_ON((atomic_inc_return(&cl->remaining) &
		CLOSURE_REMAINING_MASK) <= 1);
#else
	atomic_inc(&cl->remaining);
	cl->waiting_on = f;
#endif
}

static inline void closure_set_stopped(struct closure *cl)
{
	atomic_sub(CLOSURE_RUNNING, &cl->remaining);
}

static inline bool closure_is_unlocked(struct closure *cl)
{
	return atomic_read(&cl->remaining) == -1;
}

static inline void do_closure_init(struct closure *cl, struct closure *parent,
				   bool running)
{
	cl->parent = parent;
	if (parent)
		closure_get(parent);

	if (running) {
		closure_debug_create(cl);
		atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
	} else
		atomic_set(&cl->remaining, -1);

	closure_set_ip(cl);
}

/*
 * Hack to get at the embedded closure if there is one, by doing an unsafe cast:
 * the result of __closure_type() is thrown away, it's used merely for type
 * checking.
 */
#define __to_internal_closure(cl)					\
({									\
	BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE);		\
	(struct closure *) cl;						\
})

#define closure_init_type(cl, parent, running)				\
do {									\
	struct closure *_cl = __to_internal_closure(cl);		\
	_cl->type = __closure_type(*(cl));				\
	do_closure_init(_cl, parent, running);				\
} while (0)

/**
 * __closure_init() - Initialize a closure, skipping the memset()
 *
 * May be used instead of closure_init() when memory has already been zeroed.
 */
#define __closure_init(cl, parent)					\
	closure_init_type(cl, parent, true)

/**
 * closure_init() - Initialize a closure, setting the refcount to 1
 * @cl: closure to initialize
 * @parent: parent of the new closure. cl will take a refcount on it for its
 * lifetime; may be NULL.
 */
#define closure_init(cl, parent)					\
do {									\
	memset((cl), 0, sizeof(*(cl)));					\
	__closure_init(cl, parent);					\
} while (0)

static inline void closure_init_stack(struct closure *cl)
{
	memset(cl, 0, sizeof(struct closure));
	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
}

/**
 * closure_init_unlocked() - Initialize a closure but leave it unlocked.
 * @cl: closure to initialize
 *
 * For when the closure will be used as a lock. The closure may not be used
 * until after a closure_lock() or closure_trylock().
 */
#define closure_init_unlocked(cl)					\
do {									\
	memset((cl), 0, sizeof(*(cl)));					\
	closure_init_type(cl, NULL, false);				\
} while (0)

/**
 * closure_lock() - lock and initialize a closure.
 * @cl: the closure to lock
 * @parent: the new parent for this closure
 *
 * The closure must be of one of the types that has a waitlist (otherwise we
 * wouldn't be able to sleep on contention).
 *
 * @parent has exactly the same meaning as in closure_init(); if non null, the
 * closure will take a reference on @parent which will be released when it is
 * unlocked.
 */
#define closure_lock(cl, parent)					\
	__closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)

static inline void __closure_end_sleep(struct closure *cl)
{
	__set_current_state(TASK_RUNNING);
@ -419,65 +230,9 @@ static inline void __closure_start_sleep(struct closure *cl)
	atomic_add(CLOSURE_SLEEPING, &cl->remaining);
}

/**
 * closure_wake_up() - wake up all closures on a wait list.
 */
static inline void closure_wake_up(struct closure_waitlist *list)
static inline void closure_set_stopped(struct closure *cl)
{
	smp_mb();
	__closure_wake_up(list);
}

/*
 * Wait on an event, synchronously or asynchronously - analogous to wait_event()
 * but for closures.
 *
 * The loop is oddly structured so as to avoid a race; we must check the
 * condition again after we've added ourself to the waitlist. We know if we were
 * already on the waitlist because closure_wait() returns false; thus, we only
 * schedule or break if closure_wait() returns false. If it returns true, we
 * just loop again - rechecking the condition.
 *
 * The __closure_wake_up() is necessary because we may race with the event
 * becoming true; i.e. we see event false -> wait -> recheck condition, but the
 * thread that made the event true may have called closure_wake_up() before we
 * added ourself to the wait list.
 *
 * We have to call closure_sync() at the end instead of just
 * __closure_end_sleep() because a different thread might've called
 * closure_wake_up() before us and gotten preempted before they dropped the
 * refcount on our closure. If this was a stack allocated closure, that would be
 * bad.
 */
#define closure_wait_event(list, cl, condition)				\
({									\
	typeof(condition) ret;						\
									\
	while (1) {							\
		ret = (condition);					\
		if (ret) {						\
			__closure_wake_up(list);			\
			closure_sync(cl);				\
			break;						\
		}							\
									\
		__closure_start_sleep(cl);				\
									\
		if (!closure_wait(list, cl))				\
			schedule();					\
	}								\
									\
	ret;								\
})

static inline void closure_queue(struct closure *cl)
{
	struct workqueue_struct *wq = cl->wq;
	if (wq) {
		INIT_WORK(&cl->work, cl->work.func);
		BUG_ON(!queue_work(wq, &cl->work));
	} else
		cl->fn(cl);
	atomic_sub(CLOSURE_RUNNING, &cl->remaining);
}

static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
@ -491,6 +246,76 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
	smp_mb__before_atomic_dec();
}

static inline void closure_queue(struct closure *cl)
{
	struct workqueue_struct *wq = cl->wq;
	if (wq) {
		INIT_WORK(&cl->work, cl->work.func);
		BUG_ON(!queue_work(wq, &cl->work));
	} else
		cl->fn(cl);
}

/**
 * closure_get - increment a closure's refcount
 */
static inline void closure_get(struct closure *cl)
{
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
	BUG_ON((atomic_inc_return(&cl->remaining) &
		CLOSURE_REMAINING_MASK) <= 1);
#else
	atomic_inc(&cl->remaining);
#endif
}

/**
 * closure_init - Initialize a closure, setting the refcount to 1
 * @cl: closure to initialize
 * @parent: parent of the new closure. cl will take a refcount on it for its
 * lifetime; may be NULL.
 */
static inline void closure_init(struct closure *cl, struct closure *parent)
{
	memset(cl, 0, sizeof(struct closure));
	cl->parent = parent;
	if (parent)
		closure_get(parent);

	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);

	closure_debug_create(cl);
	closure_set_ip(cl);
}

static inline void closure_init_stack(struct closure *cl)
{
	memset(cl, 0, sizeof(struct closure));
	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
}

/**
 * closure_wake_up - wake up all closures on a wait list.
 */
static inline void closure_wake_up(struct closure_waitlist *list)
{
	smp_mb();
	__closure_wake_up(list);
}

/**
 * continue_at - jump to another function with barrier
 *
 * After @cl is no longer waiting on anything (i.e. all outstanding refs have
 * been dropped with closure_put()), it will resume execution at @fn running out
 * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
 *
 * NOTE: This macro expands to a return in the calling function!
 *
 * This is because after calling continue_at() you no longer have a ref on @cl,
 * and whatever @cl owns may be freed out from under you - a running closure fn
 * has a ref on its own closure which continue_at() drops.
 */
#define continue_at(_cl, _fn, _wq)					\
do {									\
	set_closure_fn(_cl, _fn, _wq);					\
@ -498,8 +323,28 @@ do {									\
	return;								\
} while (0)

/**
 * closure_return - finish execution of a closure
 *
 * This is used to indicate that @cl is finished: when all outstanding refs on
 * @cl have been dropped @cl's ref on its parent closure (as passed to
 * closure_init()) will be dropped, if one was specified - thus this can be
 * thought of as returning to the parent closure.
 */
#define closure_return(_cl) continue_at((_cl), NULL, NULL)

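The continue_at()/closure_return() pair is the heart of the API as documented above; a minimal two-stage state machine using it might look like this (all names are invented for illustration):

static void my_op_finish(struct closure *cl)
{
	/* Runs only after every ref taken during my_op_start() has been
	 * dropped with closure_put(). */

	closure_return(cl);	/* drops the ref on the parent, if any */
}

static void my_op_start(struct closure *cl)
{
	/* ... submit IO here, taking a ref on cl for each outstanding bio ... */

	/* Expands to a return; cl must not be touched past this point: */
	continue_at(cl, my_op_finish, system_wq);
}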
/**
 * continue_at_nobarrier - jump to another function without barrier
 *
 * Causes @fn to be executed out of @cl, in @wq context (or called directly if
 * @wq is NULL).
 *
 * NOTE: like continue_at(), this macro expands to a return in the caller!
 *
 * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
 * thus it's not safe to touch anything protected by @cl after a
 * continue_at_nobarrier().
 */
#define continue_at_nobarrier(_cl, _fn, _wq)				\
do {									\
	set_closure_fn(_cl, _fn, _wq);					\
@ -507,6 +352,15 @@ do {									\
	return;								\
} while (0)

/**
 * closure_return_with_destructor - finish execution of a closure, with destructor
 *
 * Works like closure_return(), except @destructor will be called when all
 * outstanding refs on @cl have been dropped; @destructor may be used to safely
 * free the memory occupied by @cl, and it is called with the ref on the parent
 * closure still held - so @destructor could safely return an item to a
 * freelist protected by @cl's parent.
 */
#define closure_return_with_destructor(_cl, _destructor)		\
do {									\
	set_closure_fn(_cl, _destructor, NULL);				\
@ -514,6 +368,13 @@ do {									\
	return;								\
} while (0)

/**
 * closure_call - execute @fn out of a new, uninitialized closure
 *
 * Typically used when running out of one closure, and we want to run @fn
 * asynchronously out of a new closure - @parent will then wait for @cl to
 * finish.
 */
static inline void closure_call(struct closure *cl, closure_fn fn,
				struct workqueue_struct *wq,
				struct closure *parent)
@ -522,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn,
	continue_at_nobarrier(cl, fn, wq);
}

static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
					struct workqueue_struct *wq,
					struct closure *parent)
{
	if (closure_trylock(cl, parent))
		continue_at_nobarrier(cl, fn, wq);
}

#endif /* _LINUX_CLOSURE_H */
|
@ -8,6 +8,7 @@
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"

#include <linux/console.h>
#include <linux/debugfs.h>
@ -17,156 +18,88 @@

static struct dentry *debug;

const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i)) {
			struct cache *ca = PTR_CACHE(c, k, i);
			size_t bucket = PTR_BUCKET_NR(c, k, i);
			size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

			if (KEY_SIZE(k) + r > c->sb.bucket_size)
				return "bad, length too big";
			if (bucket < ca->sb.first_bucket)
				return "bad, short offset";
			if (bucket >= ca->sb.nbuckets)
				return "bad, offset past end of device";
			if (ptr_stale(c, k, i))
				return "stale";
		}

	if (!bkey_cmp(k, &ZERO_KEY))
		return "bad, null key";
	if (!KEY_PTRS(k))
		return "bad, no pointers";
	if (!KEY_SIZE(k))
		return "zeroed key";
	return "";
}

int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
	unsigned i = 0;
	char *out = buf, *end = buf + size;

#define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__))

	p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));

	if (KEY_PTRS(k))
		while (1) {
			p("%llu:%llu gen %llu",
			  PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));

			if (++i == KEY_PTRS(k))
				break;

			p(", ");
		}

	p("]");

	if (KEY_DIRTY(k))
		p(" dirty");
	if (KEY_CSUM(k))
		p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
#undef p
	return out - buf;
}

#ifdef CONFIG_BCACHE_DEBUG

static void dump_bset(struct btree *b, struct bset *i)
{
	struct bkey *k, *next;
	unsigned j;
	char buf[80];
#define for_each_written_bset(b, start, i)				\
	for (i = (start);						\
	     (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
	     i->seq == (start)->seq;					\
	     i = (void *) i + set_blocks(i, block_bytes(b->c)) *	\
		 block_bytes(b->c))

	for (k = i->start; k < end(i); k = next) {
		next = bkey_next(k);

		bch_bkey_to_text(buf, sizeof(buf), k);
		printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
		       (uint64_t *) k - i->d, i->keys, buf);

		for (j = 0; j < KEY_PTRS(k); j++) {
			size_t n = PTR_BUCKET_NR(b->c, k, j);
			printk(" bucket %zu", n);

			if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
				printk(" prio %i",
				       PTR_BUCKET(b->c, k, j)->prio);
		}

		printk(" %s\n", bch_ptr_status(b->c, k));

		if (next < end(i) &&
		    bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0)
			printk(KERN_ERR "Key skipped backwards\n");
	}
}

static void bch_dump_bucket(struct btree *b)
{
	unsigned i;

	console_lock();
	for (i = 0; i <= b->nsets; i++)
		dump_bset(b, b->sets[i].data);
	console_unlock();
}

void bch_btree_verify(struct btree *b, struct bset *new)
void bch_btree_verify(struct btree *b)
{
	struct btree *v = b->c->verify_data;
	struct closure cl;
	closure_init_stack(&cl);
	struct bset *ondisk, *sorted, *inmemory;
	struct bio *bio;

	if (!b->c->verify)
	if (!b->c->verify || !b->c->verify_ondisk)
		return;

	closure_wait_event(&b->io.wait, &cl,
			   atomic_read(&b->io.cl.remaining) == -1);

	down(&b->io_mutex);
	mutex_lock(&b->c->verify_lock);

	ondisk = b->c->verify_ondisk;
	sorted = b->c->verify_data->keys.set->data;
	inmemory = b->keys.set->data;

	bkey_copy(&v->key, &b->key);
	v->written = 0;
	v->level = b->level;
	v->keys.ops = b->keys.ops;

	bch_btree_node_read(v);
	closure_wait_event(&v->io.wait, &cl,
			   atomic_read(&b->io.cl.remaining) == -1);
	bio = bch_bbio_alloc(b->c);
	bio->bi_bdev		= PTR_CACHE(b->c, &b->key, 0)->bdev;
	bio->bi_iter.bi_sector	= PTR_OFFSET(&b->key, 0);
	bio->bi_iter.bi_size	= KEY_SIZE(&v->key) << 9;
	bch_bio_map(bio, sorted);

	if (new->keys != v->sets[0].data->keys ||
	    memcmp(new->start,
		   v->sets[0].data->start,
		   (void *) end(new) - (void *) new->start)) {
		unsigned i, j;
	submit_bio_wait(REQ_META|READ_SYNC, bio);
	bch_bbio_free(bio, b->c);

	memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9);

	bch_btree_node_read_done(v);
	sorted = v->keys.set->data;

	if (inmemory->keys != sorted->keys ||
	    memcmp(inmemory->start,
		   sorted->start,
		   (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
		struct bset *i;
		unsigned j;

		console_lock();

		printk(KERN_ERR "*** original memory node:\n");
		for (i = 0; i <= b->nsets; i++)
			dump_bset(b, b->sets[i].data);
		printk(KERN_ERR "*** in memory:\n");
		bch_dump_bset(&b->keys, inmemory, 0);

		printk(KERN_ERR "*** sorted memory node:\n");
		dump_bset(b, new);
		printk(KERN_ERR "*** read back in:\n");
		bch_dump_bset(&v->keys, sorted, 0);

		printk(KERN_ERR "*** on disk node:\n");
		dump_bset(v, v->sets[0].data);
		for_each_written_bset(b, ondisk, i) {
			unsigned block = ((void *) i - (void *) ondisk) /
				block_bytes(b->c);

		for (j = 0; j < new->keys; j++)
			if (new->d[j] != v->sets[0].data->d[j])
			printk(KERN_ERR "*** on disk block %u:\n", block);
			bch_dump_bset(&b->keys, i, block);
		}

		printk(KERN_ERR "*** block %zu not written\n",
		       ((void *) i - (void *) ondisk) / block_bytes(b->c));

		for (j = 0; j < inmemory->keys; j++)
			if (inmemory->d[j] != sorted->d[j])
				break;

		printk(KERN_ERR "b->written %u\n", b->written);

		console_unlock();
		panic("verify failed at %u\n", j);
	}

	mutex_unlock(&b->c->verify_lock);
	up(&b->io_mutex);
}

void bch_data_verify(struct cached_dev *dc, struct bio *bio)
@ -207,74 +140,6 @@ out_put:
	bio_put(check);
}

int __bch_count_data(struct btree *b)
{
	unsigned ret = 0;
	struct btree_iter iter;
	struct bkey *k;

	if (!b->level)
		for_each_key(b, k, &iter)
			ret += KEY_SIZE(k);
	return ret;
}

void __bch_check_keys(struct btree *b, const char *fmt, ...)
{
	va_list args;
	struct bkey *k, *p = NULL;
	struct btree_iter iter;
	const char *err;

	for_each_key(b, k, &iter) {
		if (!b->level) {
			err = "Keys out of order";
			if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
				goto bug;

			if (bch_ptr_invalid(b, k))
				continue;

			err =  "Overlapping keys";
			if (p && bkey_cmp(p, &START_KEY(k)) > 0)
				goto bug;
		} else {
			if (bch_ptr_bad(b, k))
				continue;

			err = "Duplicate keys";
			if (p && !bkey_cmp(p, k))
				goto bug;
		}
		p = k;
	}

	err = "Key larger than btree node key";
	if (p && bkey_cmp(p, &b->key) > 0)
		goto bug;

	return;
bug:
	bch_dump_bucket(b);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	panic("bcache error: %s:\n", err);
}

void bch_btree_iter_next_check(struct btree_iter *iter)
{
	struct bkey *k = iter->data->k, *next = bkey_next(k);

	if (next < iter->data->end &&
	    bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) {
		bch_dump_bucket(iter->b);
		panic("Key skipped backwards\n");
	}
}

#endif

#ifdef CONFIG_DEBUG_FS
@ -321,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
		if (!w)
			break;

		bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
		bch_extent_to_text(kbuf, sizeof(kbuf), &w->key);
		i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
		bch_keybuf_del(&i->keys, w);
	}
|
@ -1,47 +1,30 @@
#ifndef _BCACHE_DEBUG_H
#define _BCACHE_DEBUG_H

/* Btree/bkey debug printing */

int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
struct bio;
struct cached_dev;
struct cache_set;

#ifdef CONFIG_BCACHE_DEBUG

void bch_btree_verify(struct btree *, struct bset *);
void bch_btree_verify(struct btree *);
void bch_data_verify(struct cached_dev *, struct bio *);
int __bch_count_data(struct btree *);
void __bch_check_keys(struct btree *, const char *, ...);
void bch_btree_iter_next_check(struct btree_iter *);

#define EBUG_ON(cond)			BUG_ON(cond)
#define expensive_debug_checks(c)	((c)->expensive_debug_checks)
#define key_merging_disabled(c)		((c)->key_merging_disabled)
#define bypass_torture_test(d)		((d)->bypass_torture_test)

#else /* DEBUG */

static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
static inline void bch_btree_verify(struct btree *b) {}
static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
static inline int __bch_count_data(struct btree *b) { return -1; }
static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {}
static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}

#define EBUG_ON(cond)			do { if (cond); } while (0)
#define expensive_debug_checks(c)	0
#define key_merging_disabled(c)		0
#define bypass_torture_test(d)		0

#endif

#define bch_count_data(b)						\
	(expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1)

#define bch_check_keys(b, ...)						\
do {									\
	if (expensive_debug_checks((b)->c))				\
		__bch_check_keys(b, __VA_ARGS__);			\
} while (0)

#ifdef CONFIG_DEBUG_FS
void bch_debug_init_cache_set(struct cache_set *);
#else
|
616 drivers/md/bcache/extents.c Normal file
@ -0,0 +1,616 @@
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/bcache.txt.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "writeback.h"

static void sort_key_next(struct btree_iter *iter,
			  struct btree_iter_set *i)
{
	i->k = bkey_next(i->k);

	if (i->k == i->end)
		*i = iter->data[--iter->used];
}

static bool bch_key_sort_cmp(struct btree_iter_set l,
			     struct btree_iter_set r)
{
	int64_t c = bkey_cmp(l.k, r.k);

	return c ? c > 0 : l.k < r.k;
}

static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i)) {
			struct cache *ca = PTR_CACHE(c, k, i);
			size_t bucket = PTR_BUCKET_NR(c, k, i);
			size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

			if (KEY_SIZE(k) + r > c->sb.bucket_size ||
			    bucket <  ca->sb.first_bucket ||
			    bucket >= ca->sb.nbuckets)
				return true;
		}

	return false;
}

/* Common among btree and extent ptrs */

static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i)) {
			struct cache *ca = PTR_CACHE(c, k, i);
			size_t bucket = PTR_BUCKET_NR(c, k, i);
			size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

			if (KEY_SIZE(k) + r > c->sb.bucket_size)
				return "bad, length too big";
			if (bucket <  ca->sb.first_bucket)
				return "bad, short offset";
			if (bucket >= ca->sb.nbuckets)
				return "bad, offset past end of device";
			if (ptr_stale(c, k, i))
				return "stale";
		}

	if (!bkey_cmp(k, &ZERO_KEY))
		return "bad, null key";
	if (!KEY_PTRS(k))
		return "bad, no pointers";
	if (!KEY_SIZE(k))
		return "zeroed key";
	return "";
}

void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
{
	unsigned i = 0;
	char *out = buf, *end = buf + size;

#define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__))

	p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));

	for (i = 0; i < KEY_PTRS(k); i++) {
		if (i)
			p(", ");

		if (PTR_DEV(k, i) == PTR_CHECK_DEV)
			p("check dev");
		else
			p("%llu:%llu gen %llu", PTR_DEV(k, i),
			  PTR_OFFSET(k, i), PTR_GEN(k, i));
	}

	p("]");

	if (KEY_DIRTY(k))
		p(" dirty");
	if (KEY_CSUM(k))
		p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
#undef p
}

static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
{
	struct btree *b = container_of(keys, struct btree, keys);
	unsigned j;
	char buf[80];

	bch_extent_to_text(buf, sizeof(buf), k);
	printk(" %s", buf);

	for (j = 0; j < KEY_PTRS(k); j++) {
		size_t n = PTR_BUCKET_NR(b->c, k, j);
		printk(" bucket %zu", n);

		if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
			printk(" prio %i",
			       PTR_BUCKET(b->c, k, j)->prio);
	}

	printk(" %s\n", bch_ptr_status(b->c, k));
}

/* Btree ptrs */

bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
{
	char buf[80];

	if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
		goto bad;

	if (__ptr_invalid(c, k))
		goto bad;

	return false;
bad:
	bch_extent_to_text(buf, sizeof(buf), k);
	cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
	return true;
}

static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
{
	struct btree *b = container_of(bk, struct btree, keys);
	return __bch_btree_ptr_invalid(b->c, k);
}

static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
{
	unsigned i;
	char buf[80];
	struct bucket *g;

	if (mutex_trylock(&b->c->bucket_lock)) {
		for (i = 0; i < KEY_PTRS(k); i++)
			if (ptr_available(b->c, k, i)) {
				g = PTR_BUCKET(b->c, k, i);

				if (KEY_DIRTY(k) ||
				    g->prio != BTREE_PRIO ||
				    (b->c->gc_mark_valid &&
				     GC_MARK(g) != GC_MARK_METADATA))
					goto err;
			}

		mutex_unlock(&b->c->bucket_lock);
	}

	return false;
err:
	mutex_unlock(&b->c->bucket_lock);
	bch_extent_to_text(buf, sizeof(buf), k);
	btree_bug(b,
"inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
		  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
		  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
	return true;
}

static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
{
	struct btree *b = container_of(bk, struct btree, keys);
	unsigned i;

	if (!bkey_cmp(k, &ZERO_KEY) ||
	    !KEY_PTRS(k) ||
	    bch_ptr_invalid(bk, k))
		return true;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (!ptr_available(b->c, k, i) ||
		    ptr_stale(b->c, k, i))
			return true;

	if (expensive_debug_checks(b->c) &&
	    btree_ptr_bad_expensive(b, k))
		return true;

	return false;
}

static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
				       struct bkey *insert,
				       struct btree_iter *iter,
				       struct bkey *replace_key)
{
	struct btree *b = container_of(bk, struct btree, keys);

	if (!KEY_OFFSET(insert))
		btree_current_write(b)->prio_blocked++;

	return false;
}

const struct btree_keys_ops bch_btree_keys_ops = {
	.sort_cmp	= bch_key_sort_cmp,
	.insert_fixup	= bch_btree_ptr_insert_fixup,
	.key_invalid	= bch_btree_ptr_invalid,
	.key_bad	= bch_btree_ptr_bad,
	.key_to_text	= bch_extent_to_text,
	.key_dump	= bch_bkey_dump,
};

/* Extents */

/*
 * Returns true if l > r - unless l == r, in which case returns true if l is
 * older than r.
 *
 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
 * equal in different sets, we have to process them newest to oldest.
 */
static bool bch_extent_sort_cmp(struct btree_iter_set l,
				struct btree_iter_set r)
{
	int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));

	return c ? c > 0 : l.k < r.k;
}

static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
					  struct bkey *tmp)
{
	while (iter->used > 1) {
		struct btree_iter_set *top = iter->data, *i = top + 1;

		if (iter->used > 2 &&
		    bch_extent_sort_cmp(i[0], i[1]))
			i++;

		if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
			break;

		if (!KEY_SIZE(i->k)) {
			sort_key_next(iter, i);
			heap_sift(iter, i - top, bch_extent_sort_cmp);
			continue;
		}

		if (top->k > i->k) {
			if (bkey_cmp(top->k, i->k) >= 0)
				sort_key_next(iter, i);
			else
				bch_cut_front(top->k, i->k);

			heap_sift(iter, i - top, bch_extent_sort_cmp);
		} else {
			/* can't happen because of comparison func */
			BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));

			if (bkey_cmp(i->k, top->k) < 0) {
				bkey_copy(tmp, top->k);

				bch_cut_back(&START_KEY(i->k), tmp);
				bch_cut_front(i->k, top->k);
				heap_sift(iter, 0, bch_extent_sort_cmp);

				return tmp;
			} else {
				bch_cut_back(&START_KEY(i->k), top->k);
			}
		}
	}

	return NULL;
}

static bool bch_extent_insert_fixup(struct btree_keys *b,
				    struct bkey *insert,
				    struct btree_iter *iter,
				    struct bkey *replace_key)
{
	struct cache_set *c = container_of(b, struct btree, keys)->c;

	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
	{
		if (KEY_DIRTY(k))
			bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
						     offset, -sectors);
	}

	uint64_t old_offset;
	unsigned old_size, sectors_found = 0;

	BUG_ON(!KEY_OFFSET(insert));
	BUG_ON(!KEY_SIZE(insert));

	while (1) {
		struct bkey *k = bch_btree_iter_next(iter);
		if (!k)
			break;

		if (bkey_cmp(&START_KEY(k), insert) >= 0) {
			if (KEY_SIZE(k))
				break;
			else
				continue;
		}

		if (bkey_cmp(k, &START_KEY(insert)) <= 0)
			continue;

		old_offset = KEY_START(k);
		old_size = KEY_SIZE(k);

		/*
		 * We might overlap with 0 size extents; we can't skip these
		 * because if they're in the set we're inserting to we have to
		 * adjust them so they don't overlap with the key we're
		 * inserting. But we don't want to check them for replace
		 * operations.
		 */

		if (replace_key && KEY_SIZE(k)) {
			/*
			 * k might have been split since we inserted/found the
			 * key we're replacing
			 */
			unsigned i;
			uint64_t offset = KEY_START(k) -
				KEY_START(replace_key);

			/* But it must be a subset of the replace key */
			if (KEY_START(k) < KEY_START(replace_key) ||
			    KEY_OFFSET(k) > KEY_OFFSET(replace_key))
				goto check_failed;

			/* We didn't find a key that we were supposed to */
			if (KEY_START(k) > KEY_START(insert) + sectors_found)
				goto check_failed;

			if (!bch_bkey_equal_header(k, replace_key))
				goto check_failed;

			/* skip past gen */
			offset <<= 8;

			BUG_ON(!KEY_PTRS(replace_key));

			for (i = 0; i < KEY_PTRS(replace_key); i++)
				if (k->ptr[i] != replace_key->ptr[i] + offset)
					goto check_failed;

			sectors_found = KEY_OFFSET(k) - KEY_START(insert);
		}

		if (bkey_cmp(insert, k) < 0 &&
		    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
			/*
			 * We overlapped in the middle of an existing key: that
			 * means we have to split the old key. But we have to do
			 * slightly different things depending on whether the
			 * old key has been written out yet.
			 */

			struct bkey *top;

			subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));

			if (bkey_written(b, k)) {
				/*
				 * We insert a new key to cover the top of the
				 * old key, and the old key is modified in place
				 * to represent the bottom split.
				 *
				 * It's completely arbitrary whether the new key
				 * is the top or the bottom, but it has to match
				 * up with what btree_sort_fixup() does - it
				 * doesn't check for this kind of overlap, it
				 * depends on us inserting a new key for the top
				 * here.
				 */
				top = bch_bset_search(b, bset_tree_last(b),
						      insert);
				bch_bset_insert(b, top, k);
			} else {
				BKEY_PADDED(key) temp;
				bkey_copy(&temp.key, k);
				bch_bset_insert(b, k, &temp.key);
				top = bkey_next(k);
			}

			bch_cut_front(insert, top);
			bch_cut_back(&START_KEY(insert), k);
			bch_bset_fix_invalidated_key(b, k);
			goto out;
		}

		if (bkey_cmp(insert, k) < 0) {
			bch_cut_front(insert, k);
		} else {
			if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
				old_offset = KEY_START(insert);

			if (bkey_written(b, k) &&
			    bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
				/*
				 * Completely overwrote, so we don't have to
				 * invalidate the binary search tree
				 */
				bch_cut_front(k, k);
			} else {
				__bch_cut_back(&START_KEY(insert), k);
				bch_bset_fix_invalidated_key(b, k);
			}
		}

		subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
	}

check_failed:
	if (replace_key) {
		if (!sectors_found) {
			return true;
		} else if (sectors_found < KEY_SIZE(insert)) {
			SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
				       (KEY_SIZE(insert) - sectors_found));
			SET_KEY_SIZE(insert, sectors_found);
		}
	}
out:
	if (KEY_DIRTY(insert))
		bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
					     KEY_START(insert),
					     KEY_SIZE(insert));

	return false;
}

static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
{
	struct btree *b = container_of(bk, struct btree, keys);
	char buf[80];

	if (!KEY_SIZE(k))
		return true;

	if (KEY_SIZE(k) > KEY_OFFSET(k))
		goto bad;

	if (__ptr_invalid(b->c, k))
		goto bad;

	return false;
bad:
	bch_extent_to_text(buf, sizeof(buf), k);
	cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
	return true;
}

static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
				     unsigned ptr)
{
	struct bucket *g = PTR_BUCKET(b->c, k, ptr);
	char buf[80];

	if (mutex_trylock(&b->c->bucket_lock)) {
		if (b->c->gc_mark_valid &&
		    ((GC_MARK(g) != GC_MARK_DIRTY &&
		      KEY_DIRTY(k)) ||
		     GC_MARK(g) == GC_MARK_METADATA))
			goto err;

		if (g->prio == BTREE_PRIO)
			goto err;

		mutex_unlock(&b->c->bucket_lock);
	}

	return false;
err:
	mutex_unlock(&b->c->bucket_lock);
	bch_extent_to_text(buf, sizeof(buf), k);
	btree_bug(b,
"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
		  buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
		  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
	return true;
}

static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
{
	struct btree *b = container_of(bk, struct btree, keys);
	struct bucket *g;
	unsigned i, stale;

	if (!KEY_PTRS(k) ||
	    bch_extent_invalid(bk, k))
		return true;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (!ptr_available(b->c, k, i))
			return true;

	if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
		return false;

	for (i = 0; i < KEY_PTRS(k); i++) {
		g = PTR_BUCKET(b->c, k, i);
		stale = ptr_stale(b->c, k, i);

		btree_bug_on(stale > 96, b,
			     "key too stale: %i, need_gc %u",
			     stale, b->c->need_gc);

		btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
			     b, "stale dirty pointer");

		if (stale)
			return true;

		if (expensive_debug_checks(b->c) &&
		    bch_extent_bad_expensive(b, k, i))
			return true;
	}

	return false;
}

static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
{
	return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
		~((uint64_t)1 << 63);
}

static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
{
	struct btree *b = container_of(bk, struct btree, keys);
	unsigned i;

	if (key_merging_disabled(b->c))
		return false;

	for (i = 0; i < KEY_PTRS(l); i++)
		if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
		    PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
			return false;

	/* Keys with no pointers aren't restricted to one bucket and could
	 * overflow KEY_SIZE
	 */
	if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
		SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
		SET_KEY_SIZE(l, USHRT_MAX);

		bch_cut_front(l, r);
		return false;
	}

	if (KEY_CSUM(l)) {
		if (KEY_CSUM(r))
			l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
		else
			SET_KEY_CSUM(l, 0);
	}

	SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
	SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));

	return true;
}

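To make the merge arithmetic concrete with invented numbers: KEY_OFFSET() is an extent's end sector, and the caller (bch_bkey_try_merge()) is expected to have verified that r begins exactly where l ends. Merging l covering sectors [1008, 1024) (end 1024, size 16) with r covering [1024, 1032) (end 1032, size 8) leaves l covering [1008, 1032) - its end offset becomes 1032 and its size 24 - while r is consumed; only l's end and size change, never its start.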
const struct btree_keys_ops bch_extent_keys_ops = {
|
||||
.sort_cmp = bch_extent_sort_cmp,
|
||||
.sort_fixup = bch_extent_sort_fixup,
|
||||
.insert_fixup = bch_extent_insert_fixup,
|
||||
.key_invalid = bch_extent_invalid,
|
||||
.key_bad = bch_extent_bad,
|
||||
.key_merge = bch_extent_merge,
|
||||
.key_to_text = bch_extent_to_text,
|
||||
.key_dump = bch_bkey_dump,
|
||||
.is_extents = true,
|
||||
};
|
13
drivers/md/bcache/extents.h
Normal file
13
drivers/md/bcache/extents.h
Normal file
@ -0,0 +1,13 @@
|
||||
#ifndef _BCACHE_EXTENTS_H
|
||||
#define _BCACHE_EXTENTS_H
|
||||
|
||||
extern const struct btree_keys_ops bch_btree_keys_ops;
|
||||
extern const struct btree_keys_ops bch_extent_keys_ops;
|
||||
|
||||
struct bkey;
|
||||
struct cache_set;
|
||||
|
||||
void bch_extent_to_text(char *, size_t, const struct bkey *);
|
||||
bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
|
||||
|
||||
#endif /* _BCACHE_EXTENTS_H */
|
@ -44,11 +44,11 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
|
||||
|
||||
closure_init_stack(&cl);
|
||||
|
||||
pr_debug("reading %llu", (uint64_t) bucket);
|
||||
pr_debug("reading %u", bucket_index);
|
||||
|
||||
while (offset < ca->sb.bucket_size) {
|
||||
reread: left = ca->sb.bucket_size - offset;
|
||||
len = min_t(unsigned, left, PAGE_SECTORS * 8);
|
||||
len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
|
||||
|
||||
bio_reset(bio);
|
||||
bio->bi_iter.bi_sector = bucket + offset;
|
||||
@ -74,19 +74,28 @@ reread: left = ca->sb.bucket_size - offset;
|
||||
struct list_head *where;
|
||||
size_t blocks, bytes = set_bytes(j);
|
||||
|
||||
if (j->magic != jset_magic(&ca->sb))
|
||||
if (j->magic != jset_magic(&ca->sb)) {
|
||||
pr_debug("%u: bad magic", bucket_index);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (bytes > left << 9)
|
||||
if (bytes > left << 9 ||
|
||||
bytes > PAGE_SIZE << JSET_BITS) {
|
||||
pr_info("%u: too big, %zu bytes, offset %u",
|
||||
bucket_index, bytes, offset);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (bytes > len << 9)
|
||||
goto reread;
|
||||
|
||||
if (j->csum != csum_set(j))
|
||||
if (j->csum != csum_set(j)) {
|
||||
pr_info("%u: bad csum, %zu bytes, offset %u",
|
||||
bucket_index, bytes, offset);
|
||||
return ret;
|
||||
}
|
||||
|
||||
blocks = set_blocks(j, ca->set);
|
||||
blocks = set_blocks(j, block_bytes(ca->set));
|
||||
|
||||
while (!list_empty(list)) {
|
||||
i = list_first_entry(list,
|
||||
@ -275,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
|
||||
}
|
||||
|
||||
for (k = i->j.start;
|
||||
k < end(&i->j);
|
||||
k < bset_bkey_last(&i->j);
|
||||
k = bkey_next(k)) {
|
||||
unsigned j;
|
||||
|
||||
@ -313,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
|
||||
n, i->j.seq - 1, start, end);
|
||||
|
||||
for (k = i->j.start;
|
||||
k < end(&i->j);
|
||||
k < bset_bkey_last(&i->j);
|
||||
k = bkey_next(k)) {
|
||||
trace_bcache_journal_replay_key(k);
|
||||
|
||||
@ -555,6 +564,14 @@ static void journal_write_done(struct closure *cl)
|
||||
continue_at_nobarrier(cl, journal_write, system_wq);
|
||||
}
|
||||
|
||||
static void journal_write_unlock(struct closure *cl)
|
||||
{
|
||||
struct cache_set *c = container_of(cl, struct cache_set, journal.io);
|
||||
|
||||
c->journal.io_in_flight = 0;
|
||||
spin_unlock(&c->journal.lock);
|
||||
}
|
||||
|
||||
static void journal_write_unlocked(struct closure *cl)
|
||||
__releases(c->journal.lock)
|
||||
{
|
||||
@ -562,22 +579,15 @@ static void journal_write_unlocked(struct closure *cl)
|
||||
struct cache *ca;
|
||||
struct journal_write *w = c->journal.cur;
|
||||
struct bkey *k = &c->journal.key;
|
||||
-        unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
+        unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
+                c->sb.block_size;

         struct bio *bio;
         struct bio_list list;
         bio_list_init(&list);

         if (!w->need_write) {
-                /*
-                 * XXX: have to unlock closure before we unlock journal lock,
-                 * else we race with bch_journal(). But this way we race
-                 * against cache set unregister. Doh.
-                 */
-                set_closure_fn(cl, NULL, NULL);
-                closure_sub(cl, CLOSURE_RUNNING + 1);
-                spin_unlock(&c->journal.lock);
-                return;
+                closure_return_with_destructor(cl, journal_write_unlock);
         } else if (journal_full(&c->journal)) {
                 journal_reclaim(c);
                 spin_unlock(&c->journal.lock);

@@ -586,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
                 continue_at(cl, journal_write, system_wq);
         }

-        c->journal.blocks_free -= set_blocks(w->data, c);
+        c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));

         w->data->btree_level = c->root->level;

@@ -653,17 +663,20 @@ static void journal_try_write(struct cache_set *c)

         w->need_write = true;

-        if (closure_trylock(cl, &c->cl))
-                journal_write_unlocked(cl);
-        else
+        if (!c->journal.io_in_flight) {
+                c->journal.io_in_flight = 1;
+                closure_call(cl, journal_write_unlocked, NULL, &c->cl);
+        } else {
                 spin_unlock(&c->journal.lock);
+        }
 }

 static struct journal_write *journal_wait_for_write(struct cache_set *c,
                                                     unsigned nkeys)
 {
         size_t sectors;
         struct closure cl;
+        bool wait = false;

         closure_init_stack(&cl);

@@ -673,15 +686,18 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
                 struct journal_write *w = c->journal.cur;

                 sectors = __set_blocks(w->data, w->data->keys + nkeys,
-                                       c) * c->sb.block_size;
+                                       block_bytes(c)) * c->sb.block_size;

                 if (sectors <= min_t(size_t,
                                      c->journal.blocks_free * c->sb.block_size,
                                      PAGE_SECTORS << JSET_BITS))
                         return w;

-                /* XXX: tracepoint */
+                if (wait)
+                        closure_wait(&c->journal.wait, &cl);
+
                 if (!journal_full(&c->journal)) {
-                        trace_bcache_journal_entry_full(c);
+                        if (wait)
+                                trace_bcache_journal_entry_full(c);

                         /*
@@ -692,12 +708,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
                          */
                         BUG_ON(!w->data->keys);

                         closure_wait(&w->wait, &cl);
                         journal_try_write(c); /* unlocks */
                 } else {
-                        trace_bcache_journal_full(c);
+                        if (wait)
+                                trace_bcache_journal_full(c);

-                        closure_wait(&c->journal.wait, &cl);
                         journal_reclaim(c);
                         spin_unlock(&c->journal.lock);

@@ -706,6 +721,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,

                 closure_sync(&cl);
                 spin_lock(&c->journal.lock);
+                wait = true;
         }
 }

@@ -736,7 +752,7 @@ atomic_t *bch_journal(struct cache_set *c,

         w = journal_wait_for_write(c, bch_keylist_nkeys(keys));

-        memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys));
+        memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
         w->data->keys += bch_keylist_nkeys(keys);

         ret = &fifo_back(&c->journal.pin);

@@ -780,7 +796,6 @@ int bch_journal_alloc(struct cache_set *c)
 {
         struct journal *j = &c->journal;

-        closure_init_unlocked(&j->io);
         spin_lock_init(&j->lock);
         INIT_DELAYED_WORK(&j->work, journal_write_work);
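
The journal_try_write() change above drops closure_trylock() in favour of an explicit io_in_flight flag: the flag is tested and set under the journal lock, so exactly one caller owns the in-flight write and submits it via closure_call(), and the destructor (journal_write_unlock) clears the flag when the write completes. A minimal userspace sketch of that single-writer pattern, with pthreads standing in for the kernel's spinlock and closures (names are illustrative, not the kernel API):

/*
 * Userspace model of the io_in_flight pattern: the flag is tested and
 * set under the same lock that protects the journal state, so exactly
 * one thread submits the write; later callers see the flag set and
 * back off. Build with: cc -pthread model.c
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t journal_lock = PTHREAD_MUTEX_INITIALIZER;
static int io_in_flight;

static void journal_write(void)
{
        printf("submitting journal write\n");   /* stand-in for the real I/O */
}

static void journal_try_write(void)
{
        pthread_mutex_lock(&journal_lock);
        if (!io_in_flight) {
                io_in_flight = 1;               /* this caller owns the write */
                pthread_mutex_unlock(&journal_lock);
                journal_write();
                pthread_mutex_lock(&journal_lock);
                io_in_flight = 0;               /* the kernel clears this in the destructor */
        }
        pthread_mutex_unlock(&journal_lock);
}

int main(void)
{
        journal_try_write();
        return 0;
}
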
@@ -104,6 +104,7 @@ struct journal {
         /* used when waiting because the journal was full */
         struct closure_waitlist wait;
         struct closure io;
+        int io_in_flight;
         struct delayed_work work;

         /* Number of blocks free in the bucket(s) we're currently writing to */
@@ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c)
         for_each_cache(ca, c, i) {
                 unsigned sectors_to_move = 0;
                 unsigned reserve_sectors = ca->sb.bucket_size *
-                        min(fifo_used(&ca->free), ca->free.size / 2);
+                        fifo_used(&ca->free[RESERVE_MOVINGGC]);

                 ca->heap.used = 0;
@@ -254,6 +254,24 @@ static void bch_data_insert_keys(struct closure *cl)
         closure_return(cl);
 }

+static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
+                               struct cache_set *c)
+{
+        size_t oldsize = bch_keylist_nkeys(l);
+        size_t newsize = oldsize + u64s;
+
+        /*
+         * The journalling code doesn't handle the case where the keys to insert
+         * is bigger than an empty write: If we just return -ENOMEM here,
+         * bio_insert() and bio_invalidate() will insert the keys created so far
+         * and finish the rest when the keylist is empty.
+         */
+        if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
+                return -ENOMEM;
+
+        return __bch_keylist_realloc(l, u64s);
+}
+
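
The new bch_keylist_realloc() wrapper refuses to grow a keylist past what fits in an empty journal write, i.e. block_bytes(c) - sizeof(struct jset) bytes of keys. A worked example of that bound under assumed sizes (a 4 KiB block and a hypothetical 32-byte jset header; the real values come from the superblock and struct jset):

/*
 * Worked example of the bound bch_keylist_realloc() enforces: a keylist
 * may only grow while it still fits in an empty journal write. The 4 KiB
 * block and 32-byte header below are assumed example values.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        size_t block_bytes = 4096;      /* assumed block size */
        size_t jset_header = 32;        /* hypothetical sizeof(struct jset) */
        size_t max_u64s = (block_bytes - jset_header) / sizeof(uint64_t);

        printf("keylist may grow to at most %zu u64s\n", max_u64s);
        return 0;
}
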
 static void bch_data_invalidate(struct closure *cl)
 {
         struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
@@ -266,7 +284,7 @@ static void bch_data_invalidate(struct closure *cl)
                 unsigned sectors = min(bio_sectors(bio),
                                        1U << (KEY_SIZE_BITS - 1));

-                if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
+                if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
                         goto out;

                 bio->bi_iter.bi_sector += sectors;
@@ -356,7 +374,7 @@ static void bch_data_insert_start(struct closure *cl)

                 /* 1 for the device pointer and 1 for the chksum */
                 if (bch_keylist_realloc(&op->insert_keys,
-                                        1 + (op->csum ? 1 : 0),
+                                        3 + (op->csum ? 1 : 0),
                                         op->c))
                         continue_at(cl, bch_data_insert_keys, bcache_wq);

@@ -596,14 +614,12 @@ struct search {
         /* Stack frame for bio_complete */
         struct closure cl;

-        struct bcache_device *d;
-
         struct bbio bio;
         struct bio *orig_bio;
         struct bio *cache_miss;
+        struct bcache_device *d;

         unsigned insert_bio_sectors;
-
         unsigned recoverable:1;
         unsigned write:1;
         unsigned read_dirty_data:1;
@@ -629,7 +645,8 @@ static void bch_cache_read_endio(struct bio *bio, int error)

         if (error)
                 s->iop.error = error;
-        else if (ptr_stale(s->iop.c, &b->key, 0)) {
+        else if (!KEY_DIRTY(&b->key) &&
+                 ptr_stale(s->iop.c, &b->key, 0)) {
                 atomic_long_inc(&s->iop.c->cache_read_races);
                 s->iop.error = -EINTR;
         }
@@ -710,8 +727,11 @@ static void cache_lookup(struct closure *cl)
 {
         struct search *s = container_of(cl, struct search, iop.cl);
         struct bio *bio = &s->bio.bio;
+        int ret;

-        int ret = bch_btree_map_keys(&s->op, s->iop.c,
+        bch_btree_op_init(&s->op, -1);
+
+        ret = bch_btree_map_keys(&s->op, s->iop.c,
                                      &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
                                      cache_lookup_fn, MAP_END_KEY);
         if (ret == -EAGAIN)
@@ -754,12 +774,12 @@ static void bio_complete(struct search *s)
         }
 }

-static void do_bio_hook(struct search *s)
+static void do_bio_hook(struct search *s, struct bio *orig_bio)
 {
         struct bio *bio = &s->bio.bio;

         bio_init(bio);
-        __bio_clone_fast(bio, s->orig_bio);
+        __bio_clone_fast(bio, orig_bio);
         bio->bi_end_io = request_endio;
         bio->bi_private = &s->cl;

@@ -778,26 +798,32 @@ static void search_free(struct closure *cl)
         mempool_free(s, s->d->c->search);
 }

-static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
+static inline struct search *search_alloc(struct bio *bio,
+                                          struct bcache_device *d)
 {
         struct search *s;

         s = mempool_alloc(d->c->search, GFP_NOIO);
-        memset(s, 0, offsetof(struct search, iop.insert_keys));

-        __closure_init(&s->cl, NULL);
+        closure_init(&s->cl, NULL);
+        do_bio_hook(s, bio);

-        s->iop.inode = d->id;
-        s->iop.c = d->c;
-        s->d = d;
-        s->op.lock = -1;
-        s->iop.write_point = hash_long((unsigned long) current, 16);
         s->orig_bio = bio;
-        s->write = (bio->bi_rw & REQ_WRITE) != 0;
-        s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
+        s->cache_miss = NULL;
+        s->d = d;
         s->recoverable = 1;
+        s->write = (bio->bi_rw & REQ_WRITE) != 0;
+        s->read_dirty_data = 0;
         s->start_time = jiffies;
-        do_bio_hook(s);

+        s->iop.c = d->c;
+        s->iop.bio = NULL;
+        s->iop.inode = d->id;
+        s->iop.write_point = hash_long((unsigned long) current, 16);
+        s->iop.write_prio = 0;
+        s->iop.error = 0;
+        s->iop.flags = 0;
+        s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;

         return s;
 }
@@ -843,7 +869,7 @@ static void cached_dev_read_error(struct closure *cl)
         trace_bcache_read_retry(s->orig_bio);

         s->iop.error = 0;
-        do_bio_hook(s);
+        do_bio_hook(s, s->orig_bio);

         /* XXX: invalidate cache */
@@ -13,6 +13,10 @@ struct data_insert_op {
         uint16_t write_prio;
         short error;

+        union {
+                uint16_t flags;
+
+        struct {
         unsigned bypass:1;
         unsigned writeback:1;
         unsigned flush_journal:1;
@@ -22,8 +26,9 @@ struct data_insert_op {
         unsigned replace_collision:1;

         unsigned insert_data_done:1;
+        };
+        };

-        /* Anything past this point won't get zeroed in search_alloc() */
         struct keylist insert_keys;
         BKEY_PADDED(replace_key);
 };
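
The union wrapped around the bitfields above lets search_alloc() reset every flag with the single store s->iop.flags = 0, which is what allowed the old offsetof()-bounded memset (and its "won't get zeroed" comment) to be dropped. A standalone sketch of the same layout trick, with an abbreviated field set and C11 anonymous members:

/*
 * Sketch of the flags-union trick: wrapping the bitfields in a union
 * with a u16 lets one assignment clear them all.
 */
#include <stdint.h>
#include <stdio.h>

struct op {
        union {
                uint16_t flags;
                struct {
                        unsigned bypass:1;
                        unsigned writeback:1;
                        unsigned flush_journal:1;
                };
        };
};

int main(void)
{
        struct op op = { .flags = 0xffff };

        op.flags = 0;   /* clears every bitfield in one store */
        printf("bypass=%u writeback=%u\n", op.bypass, op.writeback);
        return 0;
}
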
@@ -9,6 +9,7 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
+#include "extents.h"
 #include "request.h"
 #include "writeback.h"

@@ -225,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error)
         struct cached_dev *dc = bio->bi_private;
         /* XXX: error checking */

-        closure_put(&dc->sb_write.cl);
+        closure_put(&dc->sb_write);
 }

 static void __write_super(struct cache_sb *sb, struct bio *bio)
@@ -263,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
         submit_bio(REQ_WRITE, bio);
 }

+static void bch_write_bdev_super_unlock(struct closure *cl)
+{
+        struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
+
+        up(&dc->sb_write_mutex);
+}
+
 void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
 {
-        struct closure *cl = &dc->sb_write.cl;
+        struct closure *cl = &dc->sb_write;
         struct bio *bio = &dc->sb_bio;

-        closure_lock(&dc->sb_write, parent);
+        down(&dc->sb_write_mutex);
+        closure_init(cl, parent);

         bio_reset(bio);
         bio->bi_bdev = dc->bdev;
@@ -278,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
         closure_get(cl);
         __write_super(&dc->sb, bio);

-        closure_return(cl);
+        closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
 }

 static void write_super_endio(struct bio *bio, int error)
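
The bch_write_bdev_super() conversion above is the recurring pattern of this series: closure_lock() becomes an explicit down() on a semaphore plus closure_init(), and closure_return() becomes closure_return_with_destructor(), whose destructor performs the matching up(). A userspace sketch of that shape, with a POSIX semaphore standing in for the kernel one and the destructor invoked directly rather than on the last reference drop (names are illustrative):

/*
 * Userspace sketch of the closure-lock conversion: a semaphore
 * serializes writers, and the unlock lives in a destructor-style
 * function that the kernel runs when the last reference is dropped.
 */
#include <semaphore.h>
#include <stdio.h>

static sem_t sb_write_mutex;

static void write_bdev_super_unlock(void)
{
        sem_post(&sb_write_mutex);      /* the up() in the destructor */
}

static void write_bdev_super(void)
{
        sem_wait(&sb_write_mutex);      /* the down() replacing closure_lock() */
        printf("writing superblock\n");
        write_bdev_super_unlock();
}

int main(void)
{
        sem_init(&sb_write_mutex, 0, 1);
        write_bdev_super();
        sem_destroy(&sb_write_mutex);
        return 0;
}
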
@@ -286,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error)
         struct cache *ca = bio->bi_private;

         bch_count_io_errors(ca, error, "writing superblock");
-        closure_put(&ca->set->sb_write.cl);
+        closure_put(&ca->set->sb_write);
 }

+static void bcache_write_super_unlock(struct closure *cl)
+{
+        struct cache_set *c = container_of(cl, struct cache_set, sb_write);
+
+        up(&c->sb_write_mutex);
+}
+
 void bcache_write_super(struct cache_set *c)
 {
-        struct closure *cl = &c->sb_write.cl;
+        struct closure *cl = &c->sb_write;
         struct cache *ca;
         unsigned i;

-        closure_lock(&c->sb_write, &c->cl);
+        down(&c->sb_write_mutex);
+        closure_init(cl, &c->cl);

         c->sb.seq++;

@@ -317,7 +334,7 @@ void bcache_write_super(struct cache_set *c)
                 __write_super(&ca->sb, bio);
         }

-        closure_return(cl);
+        closure_return_with_destructor(cl, bcache_write_super_unlock);
 }

 /* UUID io */

@@ -325,23 +342,31 @@ void bcache_write_super(struct cache_set *c)
 static void uuid_endio(struct bio *bio, int error)
 {
         struct closure *cl = bio->bi_private;
-        struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl);
+        struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

         cache_set_err_on(error, c, "accessing uuids");
         bch_bbio_free(bio, c);
         closure_put(cl);
 }

+static void uuid_io_unlock(struct closure *cl)
+{
+        struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
+
+        up(&c->uuid_write_mutex);
+}
+
 static void uuid_io(struct cache_set *c, unsigned long rw,
                     struct bkey *k, struct closure *parent)
 {
-        struct closure *cl = &c->uuid_write.cl;
+        struct closure *cl = &c->uuid_write;
         struct uuid_entry *u;
         unsigned i;
         char buf[80];

         BUG_ON(!parent);
-        closure_lock(&c->uuid_write, parent);
+        down(&c->uuid_write_mutex);
+        closure_init(cl, parent);

         for (i = 0; i < KEY_PTRS(k); i++) {
                 struct bio *bio = bch_bbio_alloc(c);
@@ -359,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
                 break;
         }

-        bch_bkey_to_text(buf, sizeof(buf), k);
+        bch_extent_to_text(buf, sizeof(buf), k);
         pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);

         for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
@@ -368,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
                          u - c->uuids, u->uuid, u->label,
                          u->first_reg, u->last_reg, u->invalidated);

-        closure_return(cl);
+        closure_return_with_destructor(cl, uuid_io_unlock);
 }

 static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
 {
         struct bkey *k = &j->uuid_bucket;

-        if (bch_btree_ptr_invalid(c, k))
+        if (__bch_btree_ptr_invalid(c, k))
                 return "bad uuid pointer";

         bkey_copy(&c->uuid_bucket, k);

@@ -420,7 +445,7 @@ static int __uuid_write(struct cache_set *c)

         lockdep_assert_held(&bch_register_lock);

-        if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true))
+        if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
                 return 1;

         SET_KEY_SIZE(&k.key, c->sb.bucket_size);

@@ -538,8 +563,8 @@ void bch_prio_write(struct cache *ca)
         atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
                         &ca->meta_sectors_written);

-        pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
-                 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+        //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
+        //         fifo_used(&ca->free_inc), fifo_used(&ca->unused));

         for (i = prio_buckets(ca) - 1; i >= 0; --i) {
                 long bucket;
@@ -558,7 +583,7 @@ void bch_prio_write(struct cache *ca)
                 p->magic = pset_magic(&ca->sb);
                 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);

-                bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true);
+                bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
                 BUG_ON(bucket == -1);

                 mutex_unlock(&ca->set->bucket_lock);

@@ -1098,7 +1123,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
         set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
         kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
         INIT_WORK(&dc->detach, cached_dev_detach_finish);
-        closure_init_unlocked(&dc->sb_write);
+        sema_init(&dc->sb_write_mutex, 1);
         INIT_LIST_HEAD(&dc->io_lru);
         spin_lock_init(&dc->io_lock);
         bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
@@ -1110,6 +1135,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
                 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
         }

+        dc->disk.stripe_size = q->limits.io_opt >> 9;
+
+        if (dc->disk.stripe_size)
+                dc->partial_stripes_expensive =
+                        q->limits.raid_partial_stripes_expensive;
+
         ret = bcache_device_init(&dc->disk, block_size,
                          dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
         if (ret)

@@ -1321,8 +1352,8 @@ static void cache_set_free(struct closure *cl)
                 if (ca)
                         kobject_put(&ca->kobj);

+        bch_bset_sort_state_free(&c->sort);
         free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
-        free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));

         if (c->bio_split)
                 bioset_free(c->bio_split);

@@ -1447,21 +1478,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
         c->block_bits = ilog2(sb->block_size);
         c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);

-        c->btree_pages = c->sb.bucket_size / PAGE_SECTORS;
+        c->btree_pages = bucket_pages(c);
         if (c->btree_pages > BTREE_MAX_PAGES)
                 c->btree_pages = max_t(int, c->btree_pages / 4,
                                        BTREE_MAX_PAGES);

-        c->sort_crit_factor = int_sqrt(c->btree_pages);
-
-        closure_init_unlocked(&c->sb_write);
+        sema_init(&c->sb_write_mutex, 1);
         mutex_init(&c->bucket_lock);
         init_waitqueue_head(&c->try_wait);
         init_waitqueue_head(&c->bucket_wait);
-        closure_init_unlocked(&c->uuid_write);
-        mutex_init(&c->sort_lock);
+        sema_init(&c->uuid_write_mutex, 1);

-        spin_lock_init(&c->sort_time.lock);
         spin_lock_init(&c->btree_gc_time.lock);
         spin_lock_init(&c->btree_split_time.lock);
         spin_lock_init(&c->btree_read_time.lock);

@@ -1489,11 +1516,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
                        bucket_pages(c))) ||
             !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
             !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
-            !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
             !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
             bch_journal_alloc(c) ||
             bch_btree_cache_alloc(c) ||
-            bch_open_buckets_alloc(c))
+            bch_open_buckets_alloc(c) ||
+            bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
                 goto err;

         c->congested_read_threshold_us = 2000;

@@ -1549,7 +1576,7 @@ static void run_cache_set(struct cache_set *c)
                 k = &j->btree_root;

                 err = "bad btree root";
-                if (bch_btree_ptr_invalid(c, k))
+                if (__bch_btree_ptr_invalid(c, k))
                         goto err;

                 err = "error reading btree root";

@@ -1743,6 +1770,7 @@ err:
 void bch_cache_release(struct kobject *kobj)
 {
         struct cache *ca = container_of(kobj, struct cache, kobj);
+        unsigned i;

         if (ca->set)
                 ca->set->cache[ca->sb.nr_this_dev] = NULL;
@@ -1756,7 +1784,9 @@ void bch_cache_release(struct kobject *kobj)
         free_heap(&ca->heap);
         free_fifo(&ca->unused);
         free_fifo(&ca->free_inc);
-        free_fifo(&ca->free);
+
+        for (i = 0; i < RESERVE_NR; i++)
+                free_fifo(&ca->free[i]);

         if (ca->sb_bio.bi_inline_vecs[0].bv_page)
                 put_page(ca->sb_bio.bi_io_vec[0].bv_page);

@@ -1782,10 +1812,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
         ca->journal.bio.bi_max_vecs = 8;
         ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;

-        free = roundup_pow_of_two(ca->sb.nbuckets) >> 9;
-        free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
+        free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;

-        if (!init_fifo(&ca->free, free, GFP_KERNEL) ||
+        if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
+            !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
+            !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
+            !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
             !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
             !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
             !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
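
cache_alloc() now splits the old single free list into fixed-purpose reserves, so btree, prio and moving-GC allocations cannot be starved by ordinary ones. A compact model of the sizing logic, mirroring the kernel's RESERVE_* indices and formulas with assumed example values for nbuckets and prio_buckets:

/*
 * Model of the RESERVE_* split in cache_alloc(): one FIFO per reserve,
 * sized as above (8 buckets for btree, prio_buckets for prio, an
 * nbuckets-derived size for the rest).
 */
#include <stdio.h>
#include <stddef.h>

enum alloc_reserve {
        RESERVE_BTREE,
        RESERVE_PRIO,
        RESERVE_MOVINGGC,
        RESERVE_NONE,
        RESERVE_NR,
};

static size_t roundup_pow_of_two(size_t n)
{
        size_t r = 1;

        while (r < n)
                r <<= 1;
        return r;
}

int main(void)
{
        size_t nbuckets = 131072;       /* assumed bucket count */
        size_t prio_buckets = 2;        /* assumed */
        size_t free = roundup_pow_of_two(nbuckets) >> 10;
        size_t size[RESERVE_NR];
        int i;

        size[RESERVE_BTREE] = 8;
        size[RESERVE_PRIO] = prio_buckets;
        size[RESERVE_MOVINGGC] = free;
        size[RESERVE_NONE] = free;

        for (i = 0; i < RESERVE_NR; i++)
                printf("reserve %d: %zu buckets\n", i, size[i]);
        return 0;
}
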
@@ -2030,6 +2062,7 @@ static void bcache_exit(void)
         kobject_put(bcache_kobj);
         if (bcache_wq)
                 destroy_workqueue(bcache_wq);
+        if (bcache_major)
                 unregister_blkdev(bcache_major, "bcache");
         unregister_reboot_notifier(&reboot);
 }
@@ -102,7 +102,6 @@ rw_attribute(bypass_torture_test);
 rw_attribute(key_merging_disabled);
 rw_attribute(gc_always_rewrite);
 rw_attribute(expensive_debug_checks);
-rw_attribute(freelist_percent);
 rw_attribute(cache_replacement_policy);
 rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);

@@ -401,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = {
 };
 KTYPE(bch_flash_dev);

+struct bset_stats_op {
+        struct btree_op op;
+        size_t nodes;
+        struct bset_stats stats;
+};
+
+static int btree_bset_stats(struct btree_op *b_op, struct btree *b)
+{
+        struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
+
+        op->nodes++;
+        bch_btree_keys_stats(&b->keys, &op->stats);
+
+        return MAP_CONTINUE;
+}
+
+int bch_bset_print_stats(struct cache_set *c, char *buf)
+{
+        struct bset_stats_op op;
+        int ret;
+
+        memset(&op, 0, sizeof(op));
+        bch_btree_op_init(&op.op, -1);
+
+        ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats);
+        if (ret < 0)
+                return ret;
+
+        return snprintf(buf, PAGE_SIZE,
+                        "btree nodes: %zu\n"
+                        "written sets: %zu\n"
+                        "unwritten sets: %zu\n"
+                        "written key bytes: %zu\n"
+                        "unwritten key bytes: %zu\n"
+                        "floats: %zu\n"
+                        "failed: %zu\n",
+                        op.nodes,
+                        op.stats.sets_written, op.stats.sets_unwritten,
+                        op.stats.bytes_written, op.stats.bytes_unwritten,
+                        op.stats.floats, op.stats.failed);
+}
+
 SHOW(__bch_cache_set)
 {
         unsigned root_usage(struct cache_set *c)
@@ -419,7 +460,7 @@ lock_root:
                         rw_lock(false, b, b->level);
                 } while (b != c->root);

-                for_each_key_filter(b, k, &iter, bch_ptr_bad)
+                for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
                         bytes += bkey_bytes(k);

                 rw_unlock(false, b);
@@ -434,7 +475,7 @@ lock_root:

                 mutex_lock(&c->bucket_lock);
                 list_for_each_entry(b, &c->btree_cache, list)
-                        ret += 1 << (b->page_order + PAGE_SHIFT);
+                        ret += 1 << (b->keys.page_order + PAGE_SHIFT);

                 mutex_unlock(&c->bucket_lock);
                 return ret;
@@ -491,7 +532,7 @@ lock_root:

         sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
         sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
-        sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us);
+        sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
         sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
         sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);

@@ -711,9 +752,6 @@ SHOW(__bch_cache)
         sysfs_print(io_errors,
                     atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);

-        sysfs_print(freelist_percent, ca->free.size * 100 /
-                    ((size_t) ca->sb.nbuckets));
-
         if (attr == &sysfs_cache_replacement_policy)
                 return bch_snprint_string_list(buf, PAGE_SIZE,
                                                cache_replacement_policies,

@@ -820,32 +858,6 @@ STORE(__bch_cache)
                 }
         }

-        if (attr == &sysfs_freelist_percent) {
-                DECLARE_FIFO(long, free);
-                long i;
-                size_t p = strtoul_or_return(buf);
-
-                p = clamp_t(size_t,
-                            ((size_t) ca->sb.nbuckets * p) / 100,
-                            roundup_pow_of_two(ca->sb.nbuckets) >> 9,
-                            ca->sb.nbuckets / 2);
-
-                if (!init_fifo_exact(&free, p, GFP_KERNEL))
-                        return -ENOMEM;
-
-                mutex_lock(&ca->set->bucket_lock);
-
-                fifo_move(&free, &ca->free);
-                fifo_swap(&free, &ca->free);
-
-                mutex_unlock(&ca->set->bucket_lock);
-
-                while (fifo_pop(&free, i))
-                        atomic_dec(&ca->buckets[i].pin);
-
-                free_fifo(&free);
-        }
-
         if (attr == &sysfs_clear_stats) {
                 atomic_long_set(&ca->sectors_written, 0);
                 atomic_long_set(&ca->btree_sectors_written, 0);

@@ -869,7 +881,6 @@ static struct attribute *bch_cache_files[] = {
         &sysfs_metadata_written,
         &sysfs_io_errors,
         &sysfs_clear_stats,
-        &sysfs_freelist_percent,
         &sysfs_cache_replacement_policy,
         NULL
 };
@@ -2,6 +2,7 @@
 #ifndef _BCACHE_UTIL_H
 #define _BCACHE_UTIL_H

+#include <linux/blkdev.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/llist.h>

@@ -17,11 +18,13 @@ struct closure;

 #ifdef CONFIG_BCACHE_DEBUG

+#define EBUG_ON(cond) BUG_ON(cond)
 #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
 #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)

 #else /* DEBUG */

+#define EBUG_ON(cond) do { if (cond); } while (0)
 #define atomic_dec_bug(v) atomic_dec(v)
 #define atomic_inc_bug(v, i) atomic_inc(v)

@@ -391,6 +394,11 @@ struct time_stats {

 void bch_time_stats_update(struct time_stats *stats, uint64_t time);

+static inline unsigned local_clock_us(void)
+{
+        return local_clock() >> 10;
+}
+
 #define NSEC_PER_ns 1L
 #define NSEC_PER_us NSEC_PER_USEC
 #define NSEC_PER_ms NSEC_PER_MSEC
@@ -6103,6 +6103,7 @@ static int run(struct mddev *mddev)
                 blk_queue_io_min(mddev->queue, chunk_size);
                 blk_queue_io_opt(mddev->queue, chunk_size *
                                  (conf->raid_disks - conf->max_degraded));
+                mddev->queue->limits.raid_partial_stripes_expensive = 1;
                 /*
                  * We can only discard a whole stripe. It doesn't make sense to
                  * discard data disk but write parity disk
@@ -1312,7 +1312,7 @@ static void bh_lru_install(struct buffer_head *bh)
                 }
                 while (out < BH_LRU_SIZE)
                         bhs[out++] = NULL;
-                memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
+                memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
         }
         bh_lru_unlock();
|
@ -291,6 +291,7 @@ struct queue_limits {
|
||||
unsigned char discard_misaligned;
|
||||
unsigned char cluster;
|
||||
unsigned char discard_zeroes_data;
|
||||
unsigned char raid_partial_stripes_expensive;
|
||||
};
|
||||
|
||||
struct request_queue {
|
||||
|
@@ -247,7 +247,7 @@ TRACE_EVENT(bcache_btree_write,
         TP_fast_assign(
                 __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0);
                 __entry->block = b->written;
-                __entry->keys = b->sets[b->nsets].data->keys;
+                __entry->keys = b->keys.set[b->keys.nsets].data->keys;
         ),

         TP_printk("bucket %zu", __entry->bucket)

@@ -411,7 +411,7 @@ TRACE_EVENT(bcache_alloc_invalidate,
         ),

         TP_fast_assign(
-                __entry->free = fifo_used(&ca->free);
+                __entry->free = fifo_used(&ca->free[RESERVE_NONE]);
                 __entry->free_inc = fifo_used(&ca->free_inc);
                 __entry->free_inc_size = ca->free_inc.size;
                 __entry->unused = fifo_used(&ca->unused);

@@ -422,8 +422,8 @@ TRACE_EVENT(bcache_alloc_invalidate,
 );

 TRACE_EVENT(bcache_alloc_fail,
-        TP_PROTO(struct cache *ca),
-        TP_ARGS(ca),
+        TP_PROTO(struct cache *ca, unsigned reserve),
+        TP_ARGS(ca, reserve),

         TP_STRUCT__entry(
                 __field(unsigned, free )

@@ -433,7 +433,7 @@ TRACE_EVENT(bcache_alloc_fail,
         ),

         TP_fast_assign(
-                __entry->free = fifo_used(&ca->free);
+                __entry->free = fifo_used(&ca->free[reserve]);
                 __entry->free_inc = fifo_used(&ca->free_inc);
                 __entry->unused = fifo_used(&ca->unused);
                 __entry->blocked = atomic_read(&ca->set->prio_blocked);
@@ -39,6 +39,7 @@ static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \
 }

 #define KEY_SIZE_BITS 16
+#define KEY_MAX_U64S 8

 KEY_FIELD(KEY_PTRS, high, 60, 3)
 KEY_FIELD(HEADER_SIZE, high, 58, 2)

@@ -118,7 +119,7 @@ static inline struct bkey *bkey_next(const struct bkey *k)
         return (struct bkey *) (d + bkey_u64s(k));
 }

-static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys)
+static inline struct bkey *bkey_idx(const struct bkey *k, unsigned nr_keys)
 {
         __u64 *d = (void *) k;
         return (struct bkey *) (d + nr_keys);
@@ -185,7 +185,8 @@ enum {
                                  * to clear media change status */
         FD_UNUSED_BIT,
         FD_DISK_CHANGED_BIT, /* disk has been changed since last i/o */
-        FD_DISK_WRITABLE_BIT /* disk is writable */
+        FD_DISK_WRITABLE_BIT, /* disk is writable */
+        FD_OPEN_SHOULD_FAIL_BIT
 };

 #define FDSETDRVPRM _IOW(2, 0x90, struct floppy_drive_params)