From 95f37eb52e18879a1b16e51b972d992b39e50a81 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:35 +0200 Subject: [PATCH 001/270] ARM: OMAP2+: fix bogus MMC GPIO labels on Nokia N8x0 The GPIO bank width is 32 on OMAP2, so all labels are incorrect. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-2-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/board-n8x0.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index 31755a378c73..3e48f34016c1 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ b/arch/arm/mach-omap2/board-n8x0.c @@ -144,8 +144,7 @@ static struct gpiod_lookup_table nokia8xx_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { /* Slot switch, GPIO 96 */ - GPIO_LOOKUP("gpio-80-111", 16, - "switch", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-96-127", 0, "switch", GPIO_ACTIVE_HIGH), { } }, }; @@ -154,11 +153,9 @@ static struct gpiod_lookup_table nokia810_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { /* Slot index 1, VSD power, GPIO 23 */ - GPIO_LOOKUP_IDX("gpio-16-31", 7, - "vsd", 1, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("gpio-0-31", 23, "vsd", 1, GPIO_ACTIVE_HIGH), /* Slot index 1, VIO power, GPIO 9 */ - GPIO_LOOKUP_IDX("gpio-0-15", 9, - "vio", 1, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("gpio-0-31", 9, "vio", 1, GPIO_ACTIVE_HIGH), { } }, }; From 480d44d0820dd5ae043dc97c0b46dabbe53cb1cf Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:36 +0200 Subject: [PATCH 002/270] ARM: OMAP2+: fix N810 MMC gpiod table Trying to append a second table for the same dev_id doesn't seem to work. The second table is just silently ignored. As a result eMMC GPIOs are not present. Fix by using separate tables for N800 and N810. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-3-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/board-n8x0.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index 3e48f34016c1..c933a91751e4 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ b/arch/arm/mach-omap2/board-n8x0.c @@ -140,7 +140,7 @@ static int slot1_cover_open; static int slot2_cover_open; static struct device *mmc_device; -static struct gpiod_lookup_table nokia8xx_mmc_gpio_table = { +static struct gpiod_lookup_table nokia800_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { /* Slot switch, GPIO 96 */ @@ -152,6 +152,8 @@ static struct gpiod_lookup_table nokia8xx_mmc_gpio_table = { static struct gpiod_lookup_table nokia810_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { + /* Slot switch, GPIO 96 */ + GPIO_LOOKUP("gpio-96-127", 0, "switch", GPIO_ACTIVE_HIGH), /* Slot index 1, VSD power, GPIO 23 */ GPIO_LOOKUP_IDX("gpio-0-31", 23, "vsd", 1, GPIO_ACTIVE_HIGH), /* Slot index 1, VIO power, GPIO 9 */ @@ -412,8 +414,6 @@ static struct omap_mmc_platform_data *mmc_data[OMAP24XX_NR_MMC]; static void __init n8x0_mmc_init(void) { - gpiod_add_lookup_table(&nokia8xx_mmc_gpio_table); - if (board_is_n810()) { mmc1_data.slots[0].name = "external"; @@ -426,6 +426,8 @@ static void __init n8x0_mmc_init(void) mmc1_data.slots[1].name = "internal"; mmc1_data.slots[1].ban_openended = 1; gpiod_add_lookup_table(&nokia810_mmc_gpio_table); + } else { + gpiod_add_lookup_table(&nokia800_mmc_gpio_table); } mmc1_data.nr_slots = 2; From d4debbcbffa45c3de5df0040af2eea74a9e794a3 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:37 +0200 Subject: [PATCH 003/270] mmc: omap: fix broken slot switch lookup The lookup is done before host->dev is initialized. It will always just fail silently, and the MMC behaviour is totally unpredictable as the switch is left in an undefined state. Fix that. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-4-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- drivers/mmc/host/omap.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index 9fb8995b43a1..aa40e1a9dc29 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1384,13 +1384,6 @@ static int mmc_omap_probe(struct platform_device *pdev) if (IS_ERR(host->virt_base)) return PTR_ERR(host->virt_base); - host->slot_switch = gpiod_get_optional(host->dev, "switch", - GPIOD_OUT_LOW); - if (IS_ERR(host->slot_switch)) - return dev_err_probe(host->dev, PTR_ERR(host->slot_switch), - "error looking up slot switch GPIO\n"); - - INIT_WORK(&host->slot_release_work, mmc_omap_slot_release_work); INIT_WORK(&host->send_stop_work, mmc_omap_send_stop_work); @@ -1409,6 +1402,12 @@ static int mmc_omap_probe(struct platform_device *pdev) host->dev = &pdev->dev; platform_set_drvdata(pdev, host); + host->slot_switch = gpiod_get_optional(host->dev, "switch", + GPIOD_OUT_LOW); + if (IS_ERR(host->slot_switch)) + return dev_err_probe(host->dev, PTR_ERR(host->slot_switch), + "error looking up slot switch GPIO\n"); + host->id = pdev->id; host->irq = irq; host->phys_base = res->start; From f6862c7f156d04f81c38467e1c304b7e9517e810 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:38 +0200 Subject: [PATCH 004/270] mmc: omap: fix deferred probe After a deferred probe, GPIO descriptor lookup will fail with EBUSY. Fix by using managed descriptors. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-5-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- drivers/mmc/host/omap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index aa40e1a9dc29..50408771ae01 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1259,18 +1259,18 @@ static int mmc_omap_new_slot(struct mmc_omap_host *host, int id) slot->pdata = &host->pdata->slots[id]; /* Check for some optional GPIO controls */ - slot->vsd = gpiod_get_index_optional(host->dev, "vsd", - id, GPIOD_OUT_LOW); + slot->vsd = devm_gpiod_get_index_optional(host->dev, "vsd", + id, GPIOD_OUT_LOW); if (IS_ERR(slot->vsd)) return dev_err_probe(host->dev, PTR_ERR(slot->vsd), "error looking up VSD GPIO\n"); - slot->vio = gpiod_get_index_optional(host->dev, "vio", - id, GPIOD_OUT_LOW); + slot->vio = devm_gpiod_get_index_optional(host->dev, "vio", + id, GPIOD_OUT_LOW); if (IS_ERR(slot->vio)) return dev_err_probe(host->dev, PTR_ERR(slot->vio), "error looking up VIO GPIO\n"); - slot->cover = gpiod_get_index_optional(host->dev, "cover", - id, GPIOD_IN); + slot->cover = devm_gpiod_get_index_optional(host->dev, "cover", + id, GPIOD_IN); if (IS_ERR(slot->cover)) return dev_err_probe(host->dev, PTR_ERR(slot->cover), "error looking up cover switch GPIO\n"); @@ -1402,8 +1402,8 @@ static int mmc_omap_probe(struct platform_device *pdev) host->dev = &pdev->dev; platform_set_drvdata(pdev, host); - host->slot_switch = gpiod_get_optional(host->dev, "switch", - GPIOD_OUT_LOW); + host->slot_switch = devm_gpiod_get_optional(host->dev, "switch", + GPIOD_OUT_LOW); if (IS_ERR(host->slot_switch)) return dev_err_probe(host->dev, PTR_ERR(host->slot_switch), "error looking up slot switch GPIO\n"); From 894ad61b85d6ba8efd4274aa8719d9ff1c89ea54 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:39 +0200 Subject: [PATCH 005/270] mmc: omap: restore original power up/down steps Commit e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") moved Nokia N810 MMC power up/down from the board file into the MMC driver. The change removed some delays, and ordering without a valid reason. Restore power up/down to match the original code. This matters only on N810 where the 2nd GPIO is in use. Other boards will see an additional delay but that should be a lesser concern than omitting delays altogether. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-6-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- drivers/mmc/host/omap.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index 50408771ae01..13fa8588e38c 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1119,10 +1119,25 @@ static void mmc_omap_set_power(struct mmc_omap_slot *slot, int power_on, host = slot->host; - if (slot->vsd) - gpiod_set_value(slot->vsd, power_on); - if (slot->vio) - gpiod_set_value(slot->vio, power_on); + if (power_on) { + if (slot->vsd) { + gpiod_set_value(slot->vsd, power_on); + msleep(1); + } + if (slot->vio) { + gpiod_set_value(slot->vio, power_on); + msleep(1); + } + } else { + if (slot->vio) { + gpiod_set_value(slot->vio, power_on); + msleep(50); + } + if (slot->vsd) { + gpiod_set_value(slot->vsd, power_on); + msleep(50); + } + } if (slot->pdata->set_power != NULL) slot->pdata->set_power(mmc_dev(slot->mmc), slot->id, power_on, From 4421405e3634a3189b541cf1e34598e44260720d Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:16:56 +0200 Subject: [PATCH 006/270] ARM: OMAP2+: fix USB regression on Nokia N8x0 GPIO chip labels are wrong for OMAP2, so the USB does not work. Fix. Fixes: 8e0285ab95a9 ("ARM/musb: omap2: Remove global GPIO numbers from TUSB6010") Signed-off-by: Aaro Koskinen Reviewed-by: Linus Walleij Message-ID: <20240223181656.1099845-1-aaro.koskinen@iki.fi> Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/board-n8x0.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index c933a91751e4..ff2a4a4d8220 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ b/arch/arm/mach-omap2/board-n8x0.c @@ -79,10 +79,8 @@ static struct musb_hdrc_platform_data tusb_data = { static struct gpiod_lookup_table tusb_gpio_table = { .dev_id = "musb-tusb", .table = { - GPIO_LOOKUP("gpio-0-15", 0, "enable", - GPIO_ACTIVE_HIGH), - GPIO_LOOKUP("gpio-48-63", 10, "int", - GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-0-31", 0, "enable", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-32-63", 26, "int", GPIO_ACTIVE_HIGH), { } }, }; From fcf3f7e2fc8a53a6140beee46ec782a4c88e4744 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 8 Mar 2024 17:37:26 +0800 Subject: [PATCH 007/270] raid1: fix use-after-free for original bio in raid1_write_request() r1_bio->bios[] is used to record new bios that will be issued to underlying disks, however, in raid1_write_request(), r1_bio->bios[] will set to the original bio temporarily. Meanwhile, if blocked rdev is set, free_r1bio() will be called causing that all r1_bio->bios[] to be freed: raid1_write_request() r1_bio = alloc_r1bio(mddev, bio); -> r1_bio->bios[] is NULL for (i = 0; i < disks; i++) -> for each rdev in conf // first rdev is normal r1_bio->bios[0] = bio; -> set to original bio // second rdev is blocked if (test_bit(Blocked, &rdev->flags)) break if (blocked_rdev) free_r1bio() put_all_bios() bio_put(r1_bio->bios[0]) -> original bio is freed Test scripts: mdadm -CR /dev/md0 -l1 -n4 /dev/sd[abcd] --assume-clean fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 \ -iodepth=128 -name=test -direct=1 echo blocked > /sys/block/md0/md/rd2/state Test result: BUG bio-264 (Not tainted): Object already free ----------------------------------------------------------------------------- Allocated in mempool_alloc_slab+0x24/0x50 age=1 cpu=1 pid=869 kmem_cache_alloc+0x324/0x480 mempool_alloc_slab+0x24/0x50 mempool_alloc+0x6e/0x220 bio_alloc_bioset+0x1af/0x4d0 blkdev_direct_IO+0x164/0x8a0 blkdev_write_iter+0x309/0x440 aio_write+0x139/0x2f0 io_submit_one+0x5ca/0xb70 __do_sys_io_submit+0x86/0x270 __x64_sys_io_submit+0x22/0x30 do_syscall_64+0xb1/0x210 entry_SYSCALL_64_after_hwframe+0x6c/0x74 Freed in mempool_free_slab+0x1f/0x30 age=1 cpu=1 pid=869 kmem_cache_free+0x28c/0x550 mempool_free_slab+0x1f/0x30 mempool_free+0x40/0x100 bio_free+0x59/0x80 bio_put+0xf0/0x220 free_r1bio+0x74/0xb0 raid1_make_request+0xadf/0x1150 md_handle_request+0xc7/0x3b0 md_submit_bio+0x76/0x130 __submit_bio+0xd8/0x1d0 submit_bio_noacct_nocheck+0x1eb/0x5c0 submit_bio_noacct+0x169/0xd40 submit_bio+0xee/0x1d0 blkdev_direct_IO+0x322/0x8a0 blkdev_write_iter+0x309/0x440 aio_write+0x139/0x2f0 Since that bios for underlying disks are not allocated yet, fix this problem by using mempool_free() directly to free the r1_bio. Fixes: 992db13a4aee ("md/raid1: free the r1bio before waiting for blocked rdev") Cc: stable@vger.kernel.org # v6.6+ Reported-by: Coly Li Signed-off-by: Yu Kuai Tested-by: Coly Li Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240308093726.1047420-1-yukuai1@huaweicloud.com --- drivers/md/raid1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index be8ac24f50b6..7b8a71ca66dd 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1558,7 +1558,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); - free_r1bio(r1_bio); + mempool_free(r1_bio, &conf->r1bio_pool); allow_barrier(conf, bio->bi_iter.bi_sector); if (bio->bi_opf & REQ_NOWAIT) { From e249884e106b81c34f8050d23935ffc12623843f Mon Sep 17 00:00:00 2001 From: Erni Sri Satya Vennela Date: Thu, 21 Mar 2024 01:22:05 -0700 Subject: [PATCH 008/270] x86/hyperv: Cosmetic changes for hv_apic.c Fix issues reported by checkpatch.pl script for hv_apic.c file - Alignment should match open parenthesis - Remove unnecessary parenthesis No functional changes intended. Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/1711009325-21894-1-git-send-email-ernis@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711009325-21894-1-git-send-email-ernis@linux.microsoft.com> --- arch/x86/hyperv/hv_apic.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5fc45543e955..0569f579338b 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -105,7 +105,7 @@ static bool cpu_is_self(int cpu) * IPI implementation on Hyper-V. */ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { struct hv_send_ipi_ex *ipi_arg; unsigned long flags; @@ -132,8 +132,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask, - exclude_self ? cpu_is_self : NULL); + nr_bank = cpumask_to_vpset_skip(&ipi_arg->vp_set, mask, + exclude_self ? cpu_is_self : NULL); /* * 'nr_bank <= 0' means some CPUs in cpumask can't be @@ -147,7 +147,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, } status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, - ipi_arg, NULL); + ipi_arg, NULL); ipi_mask_ex_done: local_irq_restore(flags); @@ -155,7 +155,7 @@ ipi_mask_ex_done: } static bool __send_ipi_mask(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { int cur_cpu, vcpu, this_cpu = smp_processor_id(); struct hv_send_ipi ipi_arg; @@ -181,7 +181,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, return false; } - if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) return false; /* @@ -218,7 +218,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, } status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, - ipi_arg.cpu_mask); + ipi_arg.cpu_mask); return hv_result_success(status); do_ex_hypercall: @@ -241,7 +241,7 @@ static bool __send_ipi_one(int cpu, int vector) return false; } - if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) return false; if (vp >= 64) From 1f1dc442c57ec61c08d21d47e4c5b4f16446fe00 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 22 Mar 2024 14:10:26 -0700 Subject: [PATCH 009/270] mshyperv: Introduce hv_numa_node_to_pxm_info() Factor out logic for converting numa node to hv_proximity_domain_info into a helper function. Change hv_proximity_domain_info to a struct to improve readability. While at it, rename hv_add_logical_processor_* structs to the correct hv_input_/hv_output_ prefix, and remove the flags field which is not present in the ABI. Signed-off-by: Nuno Das Neves Reviewed-by: Wei Liu Link: https://lore.kernel.org/r/1711141826-9458-1-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711141826-9458-1-git-send-email-nunodasneves@linux.microsoft.com> --- arch/x86/hyperv/hv_proc.c | 22 ++++------------------ include/asm-generic/hyperv-tlfs.h | 19 +++++++------------ include/asm-generic/mshyperv.h | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 30 deletions(-) diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c index 68a0843d4750..3fa1f2ee7b0d 100644 --- a/arch/x86/hyperv/hv_proc.c +++ b/arch/x86/hyperv/hv_proc.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -116,12 +115,11 @@ free_buf: int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) { - struct hv_add_logical_processor_in *input; - struct hv_add_logical_processor_out *output; + struct hv_input_add_logical_processor *input; + struct hv_output_add_logical_processor *output; u64 status; unsigned long flags; int ret = HV_STATUS_SUCCESS; - int pxm = node_to_pxm(node); /* * When adding a logical processor, the hypervisor may return @@ -137,11 +135,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) input->lp_index = lp_index; input->apic_id = apic_id; - input->flags = 0; - input->proximity_domain_info.domain_id = pxm; - input->proximity_domain_info.flags.reserved = 0; - input->proximity_domain_info.flags.proximity_info_valid = 1; - input->proximity_domain_info.flags.proximity_preferred = 1; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR, input, output); local_irq_restore(flags); @@ -166,7 +160,6 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) u64 status; unsigned long irq_flags; int ret = HV_STATUS_SUCCESS; - int pxm = node_to_pxm(node); /* Root VPs don't seem to need pages deposited */ if (partition_id != hv_current_partition_id) { @@ -185,14 +178,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) input->vp_index = vp_index; input->flags = flags; input->subnode_type = HvSubnodeAny; - if (node != NUMA_NO_NODE) { - input->proximity_domain_info.domain_id = pxm; - input->proximity_domain_info.flags.reserved = 0; - input->proximity_domain_info.flags.proximity_info_valid = 1; - input->proximity_domain_info.flags.proximity_preferred = 1; - } else { - input->proximity_domain_info.as_uint64 = 0; - } + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); local_irq_restore(irq_flags); diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index fdac4a1714ec..69f3f68dc249 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -512,13 +512,9 @@ struct hv_proximity_domain_flags { u32 proximity_info_valid : 1; } __packed; -/* Not a union in windows but useful for zeroing */ -union hv_proximity_domain_info { - struct { - u32 domain_id; - struct hv_proximity_domain_flags flags; - }; - u64 as_uint64; +struct hv_proximity_domain_info { + u32 domain_id; + struct hv_proximity_domain_flags flags; } __packed; struct hv_lp_startup_status { @@ -532,14 +528,13 @@ struct hv_lp_startup_status { } __packed; /* HvAddLogicalProcessor hypercall */ -struct hv_add_logical_processor_in { +struct hv_input_add_logical_processor { u32 lp_index; u32 apic_id; - union hv_proximity_domain_info proximity_domain_info; - u64 flags; + struct hv_proximity_domain_info proximity_domain_info; } __packed; -struct hv_add_logical_processor_out { +struct hv_output_add_logical_processor { struct hv_lp_startup_status startup_status; } __packed; @@ -560,7 +555,7 @@ struct hv_create_vp { u8 padding[3]; u8 subnode_type; u64 subnode_id; - union hv_proximity_domain_info proximity_domain_info; + struct hv_proximity_domain_info proximity_domain_info; u64 flags; } __packed; diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 430f0ae0dde2..cfb0b51a0998 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +68,19 @@ extern u64 hv_do_fast_hypercall8(u16 control, u64 input8); bool hv_isolation_type_snp(void); bool hv_isolation_type_tdx(void); +static inline struct hv_proximity_domain_info hv_numa_node_to_pxm_info(int node) +{ + struct hv_proximity_domain_info pxm_info = {}; + + if (node != NUMA_NO_NODE) { + pxm_info.domain_id = node_to_pxm(node); + pxm_info.flags.proximity_info_valid = 1; + pxm_info.flags.proximity_preferred = 1; + } + + return pxm_info; +} + /* Helper functions that provide a consistent pattern for checking Hyper-V hypercall status. */ static inline int hv_result(u64 status) { From 1a4bd2b128fb5ca62e4d1c5ca298d3d06b9c1e8e Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Mon, 11 Mar 2024 12:07:00 +0100 Subject: [PATCH 010/270] firmware: arm_ffa: Fix the partition ID check in ffa_notification_info_get() FFA_NOTIFICATION_INFO_GET retrieves information about pending notifications. Notifications can be either global or per VCPU. Global notifications are reported with the partition ID only in the list of endpoints with pending notifications. ffa_notification_info_get() incorrectly expect no ID at all for global notifications. Fix this by checking for ID = 1 instead of ID = 0. Fixes: 3522be48d82b ("firmware: arm_ffa: Implement the NOTIFICATION_INFO_GET interface") Signed-off-by: Jens Wiklander Reviewed-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20240311110700.2367142-1-jens.wiklander@linaro.org Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c index f2556a8e9401..9bc2e10381af 100644 --- a/drivers/firmware/arm_ffa/driver.c +++ b/drivers/firmware/arm_ffa/driver.c @@ -790,7 +790,7 @@ static void ffa_notification_info_get(void) part_id = packed_id_list[ids_processed++]; - if (!ids_count[list]) { /* Global Notification */ + if (ids_count[list] == 1) { /* Global Notification */ __do_sched_recv_cb(part_id, 0, false); continue; } From 17f243adf1653bdbaeec767e3e74c9ad089f470b Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Mon, 11 Mar 2024 10:04:12 +0100 Subject: [PATCH 011/270] firmware: arm_scmi: Fix wrong fastchannel initialization Fastchannels are initialized with an incorrect index(POWERCAP_PAI_GET) in: commit 2441caa84aac ("firmware: arm_scmi: Populate fast channel rate_limit") Fix this and provide a correct index(POWERCAP_FC_PAI) Fixes: 2441caa84aac ("firmware: arm_scmi: Populate fast channel rate_limit") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202403100744.7Op3PI8L-lkp@intel.com/ Signed-off-by: Pierre Gondois Link: https://lore.kernel.org/r/20240311090413.1710725-1-pierre.gondois@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/powercap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/powercap.c b/drivers/firmware/arm_scmi/powercap.c index ea9201e7044c..1fa79bba492e 100644 --- a/drivers/firmware/arm_scmi/powercap.c +++ b/drivers/firmware/arm_scmi/powercap.c @@ -736,7 +736,7 @@ static void scmi_powercap_domain_init_fc(const struct scmi_protocol_handle *ph, ph->hops->fastchannel_init(ph, POWERCAP_DESCRIBE_FASTCHANNEL, POWERCAP_PAI_GET, 4, domain, &fc[POWERCAP_FC_PAI].get_addr, NULL, - &fc[POWERCAP_PAI_GET].rate_limit); + &fc[POWERCAP_FC_PAI].rate_limit); *p_fc = fc; } From b70c7996d4ffb2e02895132e8a79a37cee66504f Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Fri, 15 Mar 2024 14:03:24 +0000 Subject: [PATCH 012/270] firmware: arm_scmi: Make raw debugfs entries non-seekable SCMI raw debugfs entries are used to inject and snoop messages out of the SCMI core and, as such, the underlying virtual files have no reason to support seeking. Modify the related file_operations descriptors to be non-seekable. Fixes: 3c3d818a9317 ("firmware: arm_scmi: Add core raw transmission support") Signed-off-by: Cristian Marussi Link: https://lore.kernel.org/r/20240315140324.231830-1-cristian.marussi@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/raw_mode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/raw_mode.c b/drivers/firmware/arm_scmi/raw_mode.c index 350573518503..130d13e9cd6b 100644 --- a/drivers/firmware/arm_scmi/raw_mode.c +++ b/drivers/firmware/arm_scmi/raw_mode.c @@ -921,7 +921,7 @@ static int scmi_dbg_raw_mode_open(struct inode *inode, struct file *filp) rd->raw = raw; filp->private_data = rd; - return 0; + return nonseekable_open(inode, filp); } static int scmi_dbg_raw_mode_release(struct inode *inode, struct file *filp) @@ -950,6 +950,7 @@ static const struct file_operations scmi_dbg_raw_mode_reset_fops = { .open = scmi_dbg_raw_mode_open, .release = scmi_dbg_raw_mode_release, .write = scmi_dbg_raw_mode_reset_write, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -959,6 +960,7 @@ static const struct file_operations scmi_dbg_raw_mode_message_fops = { .read = scmi_dbg_raw_mode_message_read, .write = scmi_dbg_raw_mode_message_write, .poll = scmi_dbg_raw_mode_message_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -975,6 +977,7 @@ static const struct file_operations scmi_dbg_raw_mode_message_async_fops = { .read = scmi_dbg_raw_mode_message_read, .write = scmi_dbg_raw_mode_message_async_write, .poll = scmi_dbg_raw_mode_message_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -998,6 +1001,7 @@ static const struct file_operations scmi_dbg_raw_mode_notification_fops = { .release = scmi_dbg_raw_mode_release, .read = scmi_test_dbg_raw_mode_notif_read, .poll = scmi_test_dbg_raw_mode_notif_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -1021,6 +1025,7 @@ static const struct file_operations scmi_dbg_raw_mode_errors_fops = { .release = scmi_dbg_raw_mode_release, .read = scmi_test_dbg_raw_mode_errors_read, .poll = scmi_test_dbg_raw_mode_errors_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; From b7c59b038c656214f56432867056997c2e0fc268 Mon Sep 17 00:00:00 2001 From: Yuquan Wang Date: Mon, 18 Mar 2024 10:29:28 +0800 Subject: [PATCH 013/270] cxl/mem: Fix for the index of Clear Event Record Handle The dev_dbg info for Clear Event Records mailbox command would report the handle of the next record to clear not the current one. This was because the index 'i' had incremented before printing the current handle value. Fixes: 6ebe28f9ec72 ("cxl/mem: Read, trace, and clear events on driver load") Signed-off-by: Yuquan Wang Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Reviewed-by: Fan Ni Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 9adda4795eb7..50146161887d 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -915,7 +915,7 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, payload->handles[i++] = gen->hdr.handle; dev_dbg(mds->cxlds.dev, "Event log '%d': Clearing %u\n", log, - le16_to_cpu(payload->handles[i])); + le16_to_cpu(payload->handles[i - 1])); if (i == max_handles) { payload->nr_recs = i; From 5c88a9ccd4c431d58b532e4158b6999a8350062c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 19 Mar 2024 11:15:08 -0700 Subject: [PATCH 014/270] cxl/core/regs: Fix usage of map->reg_type in cxl_decode_regblock() before assigned In the error path, map->reg_type is being used for kernel warning before its value is setup. Found by code inspection. Exposure to user is wrong reg_type being emitted via kernel log. Use a local var for reg_type and retrieve value for usage. Fixes: 6c7f4f1e51c2 ("cxl/core/regs: Make cxl_map_{component, device}_regs() device generic") Reviewed-by: Dan Williams Reviewed-by: Davidlohr Bueso Signed-off-by: Dave Jiang --- drivers/cxl/core/regs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 372786f80955..3c42f984eeaf 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -271,6 +271,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, CXL); static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, struct cxl_register_map *map) { + u8 reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); int bar = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo); u64 offset = ((u64)reg_hi << 32) | (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK); @@ -278,11 +279,11 @@ static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, if (offset > pci_resource_len(pdev, bar)) { dev_warn(&pdev->dev, "BAR%d: %pr: too small (offset: %pa, type: %d)\n", bar, - &pdev->resource[bar], &offset, map->reg_type); + &pdev->resource[bar], &offset, reg_type); return false; } - map->reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); + map->reg_type = reg_type; map->resource = pci_resource_start(pdev, bar) + offset; map->max_size = pci_resource_len(pdev, bar) - offset; return true; From 3cf5abf2860bc538620fc3dfca06f403964b787b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 26 Mar 2024 14:21:30 +0530 Subject: [PATCH 015/270] MAINTAINERS: Drop Gustavo Pimentel as PCI DWC Maintainer Gustavo Pimentel seems to have left Synopsys, so his email is bouncing. And there is no indication from him expressing willingless to continue contributing to the driver. Drop him from the MAINTAINERS entry and add a CREDITS entry. Link: https://lore.kernel.org/r/20240326085130.12487-1-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam [bhelgaas: add CREDITS entry] Signed-off-by: Bjorn Helgaas --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index c55c5a0ee4ff..0107047f807b 100644 --- a/CREDITS +++ b/CREDITS @@ -3146,6 +3146,10 @@ S: Triftstra=DFe 55 S: 13353 Berlin S: Germany +N: Gustavo Pimental +E: gustavo.pimentel@synopsys.com +D: PCI driver for Synopsys DesignWare + N: Emanuel Pirker E: epirker@edu.uni-klu.ac.at D: AIC5800 IEEE 1394, RAW I/O on 1394 diff --git a/MAINTAINERS b/MAINTAINERS index aa3b947fb080..f61540722269 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16966,7 +16966,6 @@ F: drivers/pci/controller/dwc/pci-exynos.c PCI DRIVER FOR SYNOPSYS DESIGNWARE M: Jingoo Han -M: Gustavo Pimentel M: Manivannan Sadhasivam L: linux-pci@vger.kernel.org S: Maintained From c90847bcbfb65d0f1c48fcc73a2b3a2d4ceac6a1 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Tue, 26 Mar 2024 22:45:24 -0700 Subject: [PATCH 016/270] cache: sifive_ccache: Partially convert to a platform driver Commit 8ec99b033147 ("irqchip/sifive-plic: Convert PLIC driver into a platform driver") broke ccache initialization because the PLIC IRQ domain is no longer available during an arch_initcall: [ 0.087229] irq: no irq domain found for interrupt-controller@c000000 ! [ 0.087255] CCACHE: Could not request IRQ 0 Fix this by moving the IRQ handling code to a platform driver. Fixes: 8ec99b033147 ("irqchip/sifive-plic: Convert PLIC driver into a platform driver") Signed-off-by: Samuel Holland Tested-by: Geert Uytterhoeven Signed-off-by: Conor Dooley --- drivers/cache/sifive_ccache.c | 72 ++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/cache/sifive_ccache.c b/drivers/cache/sifive_ccache.c index 89ed6cd6b059..e9cc8b4786fb 100644 --- a/drivers/cache/sifive_ccache.c +++ b/drivers/cache/sifive_ccache.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include @@ -247,13 +249,49 @@ static irqreturn_t ccache_int_handler(int irq, void *device) return IRQ_HANDLED; } +static int sifive_ccache_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + unsigned long quirks; + int intr_num, rc; + + quirks = (unsigned long)device_get_match_data(dev); + + intr_num = platform_irq_count(pdev); + if (!intr_num) + return dev_err_probe(dev, -ENODEV, "No interrupts property\n"); + + for (int i = 0; i < intr_num; i++) { + if (i == DATA_UNCORR && (quirks & QUIRK_BROKEN_DATA_UNCORR)) + continue; + + g_irq[i] = platform_get_irq(pdev, i); + if (g_irq[i] < 0) + return g_irq[i]; + + rc = devm_request_irq(dev, g_irq[i], ccache_int_handler, 0, "ccache_ecc", NULL); + if (rc) + return dev_err_probe(dev, rc, "Could not request IRQ %d\n", g_irq[i]); + } + + return 0; +} + +static struct platform_driver sifive_ccache_driver = { + .probe = sifive_ccache_probe, + .driver = { + .name = "sifive_ccache", + .of_match_table = sifive_ccache_ids, + }, +}; + static int __init sifive_ccache_init(void) { struct device_node *np; struct resource res; - int i, rc, intr_num; const struct of_device_id *match; unsigned long quirks; + int rc; np = of_find_matching_node_and_match(NULL, sifive_ccache_ids, &match); if (!np) @@ -277,28 +315,6 @@ static int __init sifive_ccache_init(void) goto err_unmap; } - intr_num = of_property_count_u32_elems(np, "interrupts"); - if (!intr_num) { - pr_err("No interrupts property\n"); - rc = -ENODEV; - goto err_unmap; - } - - for (i = 0; i < intr_num; i++) { - g_irq[i] = irq_of_parse_and_map(np, i); - - if (i == DATA_UNCORR && (quirks & QUIRK_BROKEN_DATA_UNCORR)) - continue; - - rc = request_irq(g_irq[i], ccache_int_handler, 0, "ccache_ecc", - NULL); - if (rc) { - pr_err("Could not request IRQ %d\n", g_irq[i]); - goto err_free_irq; - } - } - of_node_put(np); - #ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS if (quirks & QUIRK_NONSTANDARD_CACHE_OPS) { riscv_cbom_block_size = SIFIVE_CCACHE_LINE_SIZE; @@ -315,11 +331,15 @@ static int __init sifive_ccache_init(void) #ifdef CONFIG_DEBUG_FS setup_sifive_debug(); #endif + + rc = platform_driver_register(&sifive_ccache_driver); + if (rc) + goto err_unmap; + + of_node_put(np); + return 0; -err_free_irq: - while (--i >= 0) - free_irq(g_irq[i], NULL); err_unmap: iounmap(ccache_base); err_node_put: From 8cb10cba124c4798b6cb333245ecdc8dde78aeae Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Wed, 28 Feb 2024 12:02:15 -0800 Subject: [PATCH 017/270] arm64: dts: freescale: imx8mp-venice-gw72xx-2x: fix USB vbus regulator When using usb-conn-gpio to control USB role and VBUS, the vbus-supply property must be present in the usb-conn-gpio node. Additionally it should not be present in the phy node as that isn't what controls vbus and will upset the use count. This resolves an issue where VBUS is enabled with OTG in peripheral mode. Fixes: ad9a12f7a522 ("arm64: dts: imx8mp-venice: Fix USB connector description") Signed-off-by: Tim Harvey Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi index 41c79d2ebdd6..f24b14744799 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi @@ -14,6 +14,7 @@ pinctrl-0 = <&pinctrl_usbcon1>; type = "micro"; label = "otg"; + vbus-supply = <®_usb1_vbus>; id-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; port { @@ -183,7 +184,6 @@ }; &usb3_phy0 { - vbus-supply = <®_usb1_vbus>; status = "okay"; }; From 6f8e0aca838e163e81fde176e945161d50679339 Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Wed, 28 Feb 2024 12:02:16 -0800 Subject: [PATCH 018/270] arm64: dts: freescale: imx8mp-venice-gw73xx-2x: fix USB vbus regulator When using usb-conn-gpio to control USB role and VBUS, the vbus-supply property must be present in the usb-conn-gpio node. Additionally it should not be present in the phy node as that isn't what controls vbus and will upset the use count. This resolves an issue where VBUS is enabled with OTG in peripheral mode. Fixes: ad9a12f7a522 ("arm64: dts: imx8mp-venice: Fix USB connector description") Signed-off-by: Tim Harvey Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi index d5c400b355af..f5491a608b2f 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi @@ -14,6 +14,7 @@ pinctrl-0 = <&pinctrl_usbcon1>; type = "micro"; label = "otg"; + vbus-supply = <®_usb1_vbus>; id-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; port { @@ -202,7 +203,6 @@ }; &usb3_phy0 { - vbus-supply = <®_usb1_vbus>; status = "okay"; }; From 302b84e84d108b878efc56ebfea09474159be56b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 28 Mar 2024 16:07:23 -0500 Subject: [PATCH 019/270] Revert "PCI: Mark LSI FW643 to avoid bus reset" This reverts commit 29a43dc130ce65d365a8ea9e1cc4bc51005a353e. 29a43dc130ce ("PCI: Mark LSI FW643 to avoid bus reset") by Edmund was based on the assumption that the LSI / Agere FW643 has a defect such that it can't recover after a Secondary Bus Reset (SBR). But Takashi Sakamoto reported that SBR works fine on this same FW643 device in an AMD Ryzen 5 2400G system, so apparently there is some other aspect of Edmund's system that accounts for the issue. The down side of 29a43dc130ce is that when the FW643 is assigned to a VM, avoiding the SBR means we leak data out of the VM. Revert 29a43dc130ce until we figure out a better solution. In the meantime, we can use the sysfs "reset_method" interface to restrict the available reset methods. Link: https://lore.kernel.org/r/20240328212302.1582483-1-helgaas@kernel.org Fixes: 29a43dc130ce ("PCI: Mark LSI FW643 to avoid bus reset") Reported-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20240325012135.36861-1-o-takashi@sakamocchi.jp Signed-off-by: Bjorn Helgaas Reviewed-by: Takashi Sakamoto --- drivers/pci/quirks.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index bf4833221816..eff7f5df08e2 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3765,14 +3765,6 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003e, quirk_no_bus_reset); */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_CAVIUM, 0xa100, quirk_no_bus_reset); -/* - * Apparently the LSI / Agere FW643 can't recover after a Secondary Bus - * Reset and requires a power-off or suspend/resume and rescan. Prevent - * use of that reset. - */ -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATT, 0x5900, quirk_no_bus_reset); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATT, 0x5901, quirk_no_bus_reset); - /* * Some TI KeyStone C667X devices do not support bus/hot reset. The PCIESS * automatically disables LTSSM when Secondary Bus Reset is received and From 0640f47b742667fca6aac174f7cd62b6c2c7532c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 13 Mar 2024 17:43:05 +0100 Subject: [PATCH 020/270] drm/msm/dp: fix runtime PM leak on disconnect Make sure to put the runtime PM usage count (and suspend) also when receiving a disconnect event while in the ST_MAINLINK_READY state. This specifically avoids leaking a runtime PM usage count on every disconnect with display servers that do not automatically enable external displays when receiving a hotplug notification. Fixes: 5814b8bf086a ("drm/msm/dp: incorporate pm_runtime framework into DP driver") Cc: stable@vger.kernel.org # 6.8 Cc: Kuogee Hsieh Signed-off-by: Johan Hovold Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/582744/ Link: https://lore.kernel.org/r/20240313164306.23133-2-johan+linaro@kernel.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_display.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index d80f89581760..ffa785ec641a 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -629,6 +629,7 @@ static int dp_hpd_unplug_handle(struct dp_display_private *dp, u32 data) dp_display_host_phy_exit(dp); dp->hpd_state = ST_DISCONNECTED; dp_display_notify_disconnect(&dp->dp_display.pdev->dev); + pm_runtime_put_sync(&pdev->dev); mutex_unlock(&dp->event_mutex); return 0; } From e86750b01a1560f198e4b3e21bb3f78bfd5bb2c3 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 13 Mar 2024 17:43:06 +0100 Subject: [PATCH 021/270] drm/msm/dp: fix runtime PM leak on connect failure Make sure to balance the runtime PM usage counter (and suspend) before returning on connect failures (e.g. DPCD read failures after a spurious connect event or if link training fails). Fixes: 5814b8bf086a ("drm/msm/dp: incorporate pm_runtime framework into DP driver") Cc: stable@vger.kernel.org # 6.8 Cc: Kuogee Hsieh Signed-off-by: Johan Hovold Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/582746/ Link: https://lore.kernel.org/r/20240313164306.23133-3-johan+linaro@kernel.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_display.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index ffa785ec641a..2ed40bd0acb6 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -572,6 +572,7 @@ static int dp_hpd_plug_handle(struct dp_display_private *dp, u32 data) ret = dp_display_usbpd_configure_cb(&pdev->dev); if (ret) { /* link train failed */ dp->hpd_state = ST_DISCONNECTED; + pm_runtime_put_sync(&pdev->dev); } else { dp->hpd_state = ST_MAINLINK_READY; } From c588f7d67044d6d59ef92d75a970b64929984d89 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 14:08:09 -0700 Subject: [PATCH 022/270] drm/msm: Add newlines to some debug prints These debug prints are missing newlines, leading to multiple messages being printed on one line and hard to read logs. Add newlines to have the debug prints on separate lines. The DBG macro used to add a newline, but I missed that while migrating to drm_dbg wrappers. Fixes: 7cb017db1896 ("drm/msm: Move FB debug prints to drm_dbg_state()") Fixes: 721c6e0c6aed ("drm/msm: Move vblank debug prints to drm_dbg_vbl()") Signed-off-by: Stephen Boyd Reviewed-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/584769/ Link: https://lore.kernel.org/r/20240325210810.1340820-1-swboyd@chromium.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/msm_fb.c | 6 +++--- drivers/gpu/drm/msm/msm_kms.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_fb.c b/drivers/gpu/drm/msm/msm_fb.c index e3f61c39df69..80166f702a0d 100644 --- a/drivers/gpu/drm/msm/msm_fb.c +++ b/drivers/gpu/drm/msm/msm_fb.c @@ -89,7 +89,7 @@ int msm_framebuffer_prepare(struct drm_framebuffer *fb, for (i = 0; i < n; i++) { ret = msm_gem_get_and_pin_iova(fb->obj[i], aspace, &msm_fb->iova[i]); - drm_dbg_state(fb->dev, "FB[%u]: iova[%d]: %08llx (%d)", + drm_dbg_state(fb->dev, "FB[%u]: iova[%d]: %08llx (%d)\n", fb->base.id, i, msm_fb->iova[i], ret); if (ret) return ret; @@ -176,7 +176,7 @@ static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev, const struct msm_format *format; int ret, i, n; - drm_dbg_state(dev, "create framebuffer: mode_cmd=%p (%dx%d@%4.4s)", + drm_dbg_state(dev, "create framebuffer: mode_cmd=%p (%dx%d@%4.4s)\n", mode_cmd, mode_cmd->width, mode_cmd->height, (char *)&mode_cmd->pixel_format); @@ -232,7 +232,7 @@ static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev, refcount_set(&msm_fb->dirtyfb, 1); - drm_dbg_state(dev, "create: FB ID: %d (%p)", fb->base.id, fb); + drm_dbg_state(dev, "create: FB ID: %d (%p)\n", fb->base.id, fb); return fb; diff --git a/drivers/gpu/drm/msm/msm_kms.c b/drivers/gpu/drm/msm/msm_kms.c index 84c21ec2ceea..af6a6fcb1173 100644 --- a/drivers/gpu/drm/msm/msm_kms.c +++ b/drivers/gpu/drm/msm/msm_kms.c @@ -149,7 +149,7 @@ int msm_crtc_enable_vblank(struct drm_crtc *crtc) struct msm_kms *kms = priv->kms; if (!kms) return -ENXIO; - drm_dbg_vbl(dev, "crtc=%u", crtc->base.id); + drm_dbg_vbl(dev, "crtc=%u\n", crtc->base.id); return vblank_ctrl_queue_work(priv, crtc, true); } @@ -160,7 +160,7 @@ void msm_crtc_disable_vblank(struct drm_crtc *crtc) struct msm_kms *kms = priv->kms; if (!kms) return; - drm_dbg_vbl(dev, "crtc=%u", crtc->base.id); + drm_dbg_vbl(dev, "crtc=%u\n", crtc->base.id); vblank_ctrl_queue_work(priv, crtc, false); } From 4f3b77ae5ff5b5ba9d99c5d5450db388dbee5107 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 14 Mar 2024 03:10:41 +0200 Subject: [PATCH 023/270] drm/msm/dpu: don't allow overriding data from catalog The data from catalog is marked as const, so it is a part of the RO segment. Allowing userspace to write to it through debugfs can cause protection faults. Set debugfs file mode to read-only for debug entries corresponding to perf_cfg coming from catalog. Fixes: abda0d925f9c ("drm/msm/dpu: Mark various data tables as const") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/582844/ Link: https://lore.kernel.org/r/20240314-dpu-perf-rework-v3-1-79fa4e065574@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c index ef871239adb2..68fae048a9a8 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c @@ -459,15 +459,15 @@ int dpu_core_perf_debugfs_init(struct dpu_kms *dpu_kms, struct dentry *parent) &perf->core_clk_rate); debugfs_create_u32("enable_bw_release", 0600, entry, (u32 *)&perf->enable_bw_release); - debugfs_create_u32("threshold_low", 0600, entry, + debugfs_create_u32("threshold_low", 0400, entry, (u32 *)&perf->perf_cfg->max_bw_low); - debugfs_create_u32("threshold_high", 0600, entry, + debugfs_create_u32("threshold_high", 0400, entry, (u32 *)&perf->perf_cfg->max_bw_high); - debugfs_create_u32("min_core_ib", 0600, entry, + debugfs_create_u32("min_core_ib", 0400, entry, (u32 *)&perf->perf_cfg->min_core_ib); - debugfs_create_u32("min_llcc_ib", 0600, entry, + debugfs_create_u32("min_llcc_ib", 0400, entry, (u32 *)&perf->perf_cfg->min_llcc_ib); - debugfs_create_u32("min_dram_ib", 0600, entry, + debugfs_create_u32("min_dram_ib", 0400, entry, (u32 *)&perf->perf_cfg->min_dram_ib); debugfs_create_file("perf_mode", 0600, entry, (u32 *)perf, &dpu_core_perf_mode_fops); From ee15c8bf5d77a306614bdefe33828310662dee05 Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Fri, 29 Mar 2024 12:46:26 -0700 Subject: [PATCH 024/270] drm/msm/dp: assign correct DP controller ID to x1e80100 interface table At current x1e80100 interface table, interface #3 is wrongly connected to DP controller #0 and interface #4 wrongly connected to DP controller #2. Fix this problem by connect Interface #3 to DP controller #0 and interface #4 connect to DP controller #1. Also add interface #6, #7 and #8 connections to DP controller to complete x1e80100 interface table. Changs in V3: -- add v2 changes log Changs in V2: -- add x1e80100 to subject -- add Fixes Fixes: e3b1f369db5a ("drm/msm/dpu: Add X1E80100 support") Signed-off-by: Kuogee Hsieh Reviewed-by: Abhinav Kumar Reviewed-by: Abel Vesa Patchwork: https://patchwork.freedesktop.org/patch/585549/ Link: https://lore.kernel.org/r/1711741586-9037-1-git-send-email-quic_khsieh@quicinc.com Signed-off-by: Abhinav Kumar --- .../msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h index 9a9f7092c526..a3e60ac70689 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h @@ -324,6 +324,7 @@ static const struct dpu_wb_cfg x1e80100_wb[] = { }, }; +/* TODO: INTF 3, 8 and 7 are used for MST, marked as INTF_NONE for now */ static const struct dpu_intf_cfg x1e80100_intf[] = { { .name = "intf_0", .id = INTF_0, @@ -358,8 +359,8 @@ static const struct dpu_intf_cfg x1e80100_intf[] = { .name = "intf_3", .id = INTF_3, .base = 0x37000, .len = 0x280, .features = INTF_SC7280_MASK, - .type = INTF_DP, - .controller_id = MSM_DP_CONTROLLER_1, + .type = INTF_NONE, + .controller_id = MSM_DP_CONTROLLER_0, /* pair with intf_0 for DP MST */ .prog_fetch_lines_worst_case = 24, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 30), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 31), @@ -368,7 +369,7 @@ static const struct dpu_intf_cfg x1e80100_intf[] = { .base = 0x38000, .len = 0x280, .features = INTF_SC7280_MASK, .type = INTF_DP, - .controller_id = MSM_DP_CONTROLLER_2, + .controller_id = MSM_DP_CONTROLLER_1, .prog_fetch_lines_worst_case = 24, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 20), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 21), @@ -381,6 +382,33 @@ static const struct dpu_intf_cfg x1e80100_intf[] = { .prog_fetch_lines_worst_case = 24, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 22), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 23), + }, { + .name = "intf_6", .id = INTF_6, + .base = 0x3A000, .len = 0x280, + .features = INTF_SC7280_MASK, + .type = INTF_DP, + .controller_id = MSM_DP_CONTROLLER_2, + .prog_fetch_lines_worst_case = 24, + .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 17), + .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 16), + }, { + .name = "intf_7", .id = INTF_7, + .base = 0x3b000, .len = 0x280, + .features = INTF_SC7280_MASK, + .type = INTF_NONE, + .controller_id = MSM_DP_CONTROLLER_2, /* pair with intf_6 for DP MST */ + .prog_fetch_lines_worst_case = 24, + .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 18), + .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 19), + }, { + .name = "intf_8", .id = INTF_8, + .base = 0x3c000, .len = 0x280, + .features = INTF_SC7280_MASK, + .type = INTF_NONE, + .controller_id = MSM_DP_CONTROLLER_1, /* pair with intf_4 for DP MST */ + .prog_fetch_lines_worst_case = 24, + .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 12), + .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 13), }, }; From 8844f467d6a58dc915f241e81c46e0c126f8c070 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 30 Mar 2024 05:53:22 +0200 Subject: [PATCH 025/270] drm/msm/dpu: make error messages at dpu_core_irq_register_callback() more sensible There is little point in using %ps to print a value known to be NULL. On the other hand it makes sense to print the callback symbol in the 'invalid IRQ' message. Correct those two error messages to make more sense. Fixes: 6893199183f8 ("drm/msm/dpu: stop using raw IRQ indices in the kernel output") Signed-off-by: Dmitry Baryshkov Reviewed-by: Marijn Suijten Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/585565/ Link: https://lore.kernel.org/r/20240330-dpu-irq-messages-v1-1-9ce782ae35f9@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c index 946dd0135dff..6a0a74832fb6 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c @@ -525,14 +525,14 @@ int dpu_core_irq_register_callback(struct dpu_kms *dpu_kms, int ret; if (!irq_cb) { - DPU_ERROR("invalid IRQ=[%d, %d] irq_cb:%ps\n", - DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx), irq_cb); + DPU_ERROR("IRQ=[%d, %d] NULL callback\n", + DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx)); return -EINVAL; } if (!dpu_core_irq_is_valid(irq_idx)) { - DPU_ERROR("invalid IRQ=[%d, %d]\n", - DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx)); + DPU_ERROR("invalid IRQ=[%d, %d] irq_cb:%ps\n", + DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx), irq_cb); return -EINVAL; } From cd49cca222bc5532105a1f20bff6ae60d7bf7713 Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Wed, 6 Mar 2024 11:35:15 -0800 Subject: [PATCH 026/270] drm/msm/dp: fix typo in dp_display_handle_port_status_changed() Fix the typo in the name of dp_display_handle_port_status_changed(). Fixes: c58eb1b54fee ("drm/msm/dp: fix connect/disconnect handled at irq_hpd") Signed-off-by: Abhinav Kumar Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/581746/ Link: https://lore.kernel.org/r/20240306193515.455388-1-quic_abhinavk@quicinc.com --- drivers/gpu/drm/msm/dp/dp_display.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index 2ed40bd0acb6..9575d4b80c29 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -468,7 +468,7 @@ static void dp_display_handle_video_request(struct dp_display_private *dp) } } -static int dp_display_handle_port_ststus_changed(struct dp_display_private *dp) +static int dp_display_handle_port_status_changed(struct dp_display_private *dp) { int rc = 0; @@ -525,7 +525,7 @@ static int dp_display_usbpd_attention_cb(struct device *dev) drm_dbg_dp(dp->drm_dev, "hpd_state=%d sink_request=%d\n", dp->hpd_state, sink_request); if (sink_request & DS_PORT_STATUS_CHANGED) - rc = dp_display_handle_port_ststus_changed(dp); + rc = dp_display_handle_port_status_changed(dp); else rc = dp_display_handle_irq_hpd(dp); } From be1b7acb929137e3943fe380671242beb485190c Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 2 Apr 2024 05:57:15 +0300 Subject: [PATCH 027/270] dt-bindings: display/msm: sm8150-mdss: add DP node As Qualcomm SM8150 got support for the DisplayPort, add displayport@ node as a valid child to the MDSS node. Fixes: 88806318e2c2 ("dt-bindings: display: msm: dp: declare compatible string for sm8150") Reviewed-by: Krzysztof Kozlowski Signed-off-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/586156/ Link: https://lore.kernel.org/r/20240402-fd-fix-schema-v3-1-817ea6ddf775@linaro.org Signed-off-by: Abhinav Kumar --- .../bindings/display/msm/qcom,sm8150-mdss.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml index c0d6a4fdff97..e6dc5494baee 100644 --- a/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml +++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml @@ -53,6 +53,15 @@ patternProperties: compatible: const: qcom,sm8150-dpu + "^displayport-controller@[0-9a-f]+$": + type: object + additionalProperties: true + + properties: + compatible: + contains: + const: qcom,sm8150-dp + "^dsi@[0-9a-f]+$": type: object additionalProperties: true From c6ddd6e7b166532a0816825442ff60f70aed9647 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 22 Mar 2024 12:47:05 -0400 Subject: [PATCH 028/270] arm64: dts: imx8-ss-conn: fix usdhc wrong lpcg clock order The actual clock show wrong frequency: echo on >/sys/devices/platform/bus\@5b000000/5b010000.mmc/power/control cat /sys/kernel/debug/mmc0/ios clock: 200000000 Hz actual clock: 166000000 Hz ^^^^^^^^^ ..... According to sdhc0_lpcg: clock-controller@5b200000 { compatible = "fsl,imx8qxp-lpcg"; reg = <0x5b200000 0x10000>; #clock-cells = <1>; clocks = <&clk IMX_SC_R_SDHC_0 IMX_SC_PM_CLK_PER>, <&conn_ipg_clk>, <&conn_axi_clk>; clock-indices = , , ; clock-output-names = "sdhc0_lpcg_per_clk", "sdhc0_lpcg_ipg_clk", "sdhc0_lpcg_ahb_clk"; power-domains = <&pd IMX_SC_R_SDHC_0>; } "per_clk" should be IMX_LPCG_CLK_0 instead of IMX_LPCG_CLK_5. After correct clocks order: echo on >/sys/devices/platform/bus\@5b000000/5b010000.mmc/power/control cat /sys/kernel/debug/mmc0/ios clock: 200000000 Hz actual clock: 198000000 Hz ^^^^^^^^ ... Fixes: 16c4ea7501b1 ("arm64: dts: imx8: switch to new lpcg clock binding") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi index 3c42240e78e2..af2259e99796 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi @@ -67,8 +67,8 @@ conn_subsys: bus@5b000000 { interrupts = ; reg = <0x5b010000 0x10000>; clocks = <&sdhc0_lpcg IMX_LPCG_CLK_4>, - <&sdhc0_lpcg IMX_LPCG_CLK_0>, - <&sdhc0_lpcg IMX_LPCG_CLK_5>; + <&sdhc0_lpcg IMX_LPCG_CLK_5>, + <&sdhc0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "ahb", "per"; power-domains = <&pd IMX_SC_R_SDHC_0>; status = "disabled"; @@ -78,8 +78,8 @@ conn_subsys: bus@5b000000 { interrupts = ; reg = <0x5b020000 0x10000>; clocks = <&sdhc1_lpcg IMX_LPCG_CLK_4>, - <&sdhc1_lpcg IMX_LPCG_CLK_0>, - <&sdhc1_lpcg IMX_LPCG_CLK_5>; + <&sdhc1_lpcg IMX_LPCG_CLK_5>, + <&sdhc1_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "ahb", "per"; power-domains = <&pd IMX_SC_R_SDHC_1>; fsl,tuning-start-tap = <20>; @@ -91,8 +91,8 @@ conn_subsys: bus@5b000000 { interrupts = ; reg = <0x5b030000 0x10000>; clocks = <&sdhc2_lpcg IMX_LPCG_CLK_4>, - <&sdhc2_lpcg IMX_LPCG_CLK_0>, - <&sdhc2_lpcg IMX_LPCG_CLK_5>; + <&sdhc2_lpcg IMX_LPCG_CLK_5>, + <&sdhc2_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "ahb", "per"; power-domains = <&pd IMX_SC_R_SDHC_2>; status = "disabled"; From b91695b50d5b5c7c6a2cae08637b0a119cec0e12 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 25 Mar 2024 09:14:04 -0300 Subject: [PATCH 029/270] ARM: dts: imx7-mba7: Use 'no-mmc' property 'no-emmc' is not a valid property. The original intention was to use the 'no-mmc' property. Change it accordingly to fix the following dt-schema warning: imx7s-mba7.dtb: mmc@30b40000: Unevaluated properties are not allowed ('no-emmc' was unexpected) Fixes: d430a7e0e181 ("ARM: dts: imx7-mba7: restrict usdhc interface modes") Signed-off-by: Fabio Estevam Reviewed-by: Alexander Stein Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi b/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi index 1235a71c6abe..52869e68f833 100644 --- a/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi @@ -666,7 +666,7 @@ bus-width = <4>; no-1-8-v; no-sdio; - no-emmc; + no-mmc; status = "okay"; }; From af133562d5aff41fcdbe51f1a504ae04788b5fc0 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 25 Mar 2024 09:31:04 +0100 Subject: [PATCH 030/270] swiotlb: extend buffer pre-padding to alloc_align_mask if necessary Allow a buffer pre-padding of up to alloc_align_mask, even if it requires allocating additional IO TLB slots. If the allocation alignment is bigger than IO_TLB_SIZE and min_align_mask covers any non-zero bits in the original address between IO_TLB_SIZE and alloc_align_mask, these bits are not preserved in the swiotlb buffer address. To fix this case, increase the allocation size and use a larger offset within the allocated buffer. As a result, extra padding slots may be allocated before the mapping start address. Leave orig_addr in these padding slots initialized to INVALID_PHYS_ADDR. These slots do not correspond to any CPU buffer, so attempts to sync the data should be ignored. The padding slots should be automatically released when the buffer is unmapped. However, swiotlb_tbl_unmap_single() takes only the address of the DMA buffer slot, not the first padding slot. Save the number of padding slots in struct io_tlb_slot and use it to adjust the slot index in swiotlb_release_slots(), so all allocated slots are properly freed. Fixes: 2fd4fa5d3fb5 ("swiotlb: Fix alignment checks when both allocation and DMA masks are present") Link: https://lore.kernel.org/linux-iommu/20240311210507.217daf8b@meshulam.tesarici.cz/ Signed-off-by: Petr Tesarik Reviewed-by: Michael Kelley Tested-by: Michael Kelley Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 59 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 86fe172b5958..d7a8cb93ef2d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -69,11 +69,14 @@ * @alloc_size: Size of the allocated buffer. * @list: The free list describing the number of free entries available * from each index. + * @pad_slots: Number of preceding padding slots. Valid only in the first + * allocated non-padding slot. */ struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; - unsigned int list; + unsigned short list; + unsigned short pad_slots; }; static bool swiotlb_force_bounce; @@ -287,6 +290,7 @@ static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start, mem->nslabs - i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; + mem->slots[i].pad_slots = 0; } memset(vaddr, 0, bytes); @@ -821,12 +825,30 @@ void swiotlb_dev_init(struct device *dev) #endif } -/* - * Return the offset into a iotlb slot required to keep the device happy. +/** + * swiotlb_align_offset() - Get required offset into an IO TLB allocation. + * @dev: Owning device. + * @align_mask: Allocation alignment mask. + * @addr: DMA address. + * + * Return the minimum offset from the start of an IO TLB allocation which is + * required for a given buffer address and allocation alignment to keep the + * device happy. + * + * First, the address bits covered by min_align_mask must be identical in the + * original address and the bounce buffer address. High bits are preserved by + * choosing a suitable IO TLB slot, but bits below IO_TLB_SHIFT require extra + * padding bytes before the bounce buffer. + * + * Second, @align_mask specifies which bits of the first allocated slot must + * be zero. This may require allocating additional padding slots, and then the + * offset (in bytes) from the first such padding slot is returned. */ -static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) +static unsigned int swiotlb_align_offset(struct device *dev, + unsigned int align_mask, u64 addr) { - return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); + return addr & dma_get_min_align_mask(dev) & + (align_mask | (IO_TLB_SIZE - 1)); } /* @@ -847,7 +869,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size return; tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); - orig_addr_offset = swiotlb_align_offset(dev, orig_addr); + orig_addr_offset = swiotlb_align_offset(dev, 0, orig_addr); if (tlb_offset < orig_addr_offset) { dev_WARN_ONCE(dev, 1, "Access before mapping start detected. orig offset %u, requested offset %u.\n", @@ -1005,7 +1027,7 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = dma_get_min_align_mask(dev); unsigned int nslots = nr_slots(alloc_size), stride; - unsigned int offset = swiotlb_align_offset(dev, orig_addr); + unsigned int offset = swiotlb_align_offset(dev, 0, orig_addr); unsigned int index, slots_checked, count = 0, i; unsigned long flags; unsigned int slot_base; @@ -1328,11 +1350,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, unsigned long attrs) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; - unsigned int offset = swiotlb_align_offset(dev, orig_addr); + unsigned int offset; struct io_tlb_pool *pool; unsigned int i; int index; phys_addr_t tlb_addr; + unsigned short pad_slots; if (!mem || !mem->nslabs) { dev_warn_ratelimited(dev, @@ -1349,6 +1372,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } + offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr); index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset, alloc_align_mask, &pool); if (index == -1) { @@ -1364,6 +1388,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ + pad_slots = offset >> IO_TLB_SHIFT; + offset &= (IO_TLB_SIZE - 1); + index += pad_slots; + pool->slots[index].pad_slots = pad_slots; for (i = 0; i < nr_slots(alloc_size + offset); i++) pool->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(pool->start, index) + offset; @@ -1384,13 +1412,17 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) { struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr); unsigned long flags; - unsigned int offset = swiotlb_align_offset(dev, tlb_addr); - int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; - int nslots = nr_slots(mem->slots[index].alloc_size + offset); - int aindex = index / mem->area_nslabs; - struct io_tlb_area *area = &mem->areas[aindex]; + unsigned int offset = swiotlb_align_offset(dev, 0, tlb_addr); + int index, nslots, aindex; + struct io_tlb_area *area; int count, i; + index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; + index -= mem->slots[index].pad_slots; + nslots = nr_slots(mem->slots[index].alloc_size + offset); + aindex = index / mem->area_nslabs; + area = &mem->areas[aindex]; + /* * Return the buffer to the free list by setting the corresponding * entries to indicate the number of contiguous entries available. @@ -1413,6 +1445,7 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) mem->slots[i].list = ++count; mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; + mem->slots[i].pad_slots = 0; } /* From e8068f2d756d57a5206fa3180ade365a8c12ed85 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Tue, 26 Mar 2024 20:45:48 -0700 Subject: [PATCH 031/270] swiotlb: fix swiotlb_bounce() to do partial sync's correctly In current code, swiotlb_bounce() may do partial sync's correctly in some circumstances, but may incorrectly fail in other circumstances. The failure cases require both of these to be true: 1) swiotlb_align_offset() returns a non-zero "offset" value 2) the tlb_addr of the partial sync area points into the first "offset" bytes of the _second_ or subsequent swiotlb slot allocated for the mapping Code added in commit 868c9ddc182b ("swiotlb: add overflow checks to swiotlb_bounce") attempts to WARN on the invalid case where tlb_addr points into the first "offset" bytes of the _first_ allocated slot. But there's no way for swiotlb_bounce() to distinguish the first slot from the second and subsequent slots, so the WARN can be triggered incorrectly when #2 above is true. Related, current code calculates an adjustment to the orig_addr stored in the swiotlb slot. The adjustment compensates for the difference in the tlb_addr used for the partial sync vs. the tlb_addr for the full mapping. The adjustment is stored in the local variable tlb_offset. But when #1 and #2 above are true, it's valid for this adjustment to be negative. In such case the arithmetic to adjust orig_addr produces the wrong result due to tlb_offset being declared as unsigned. Fix these problems by removing the over-constraining validations added in 868c9ddc182b. Change the declaration of tlb_offset to be signed instead of unsigned so the adjustment arithmetic works correctly. Tested with a test-only hack to how swiotlb_tbl_map_single() calls swiotlb_bounce(). Instead of calling swiotlb_bounce() just once for the entire mapped area, do a loop with each iteration doing only a 128 byte partial sync until the entire mapped area is sync'ed. Then with swiotlb=force on the kernel boot line, run a variety of raw disk writes followed by read and verification of all bytes of the written data. The storage device has DMA min_align_mask set, and the writes are done with a variety of original buffer memory address alignments and overall buffer sizes. For many of the combinations, current code triggers the WARN statements, or the data verification fails. With the fixes, no WARNs occur and all verifications pass. Fixes: 5f89468e2f06 ("swiotlb: manipulate orig_addr when tlb_addr has offset") Fixes: 868c9ddc182b ("swiotlb: add overflow checks to swiotlb_bounce") Signed-off-by: Michael Kelley Dominique Martinet Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d7a8cb93ef2d..d57c8837c813 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -863,27 +863,23 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; - unsigned int tlb_offset, orig_addr_offset; + int tlb_offset; if (orig_addr == INVALID_PHYS_ADDR) return; - tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); - orig_addr_offset = swiotlb_align_offset(dev, 0, orig_addr); - if (tlb_offset < orig_addr_offset) { - dev_WARN_ONCE(dev, 1, - "Access before mapping start detected. orig offset %u, requested offset %u.\n", - orig_addr_offset, tlb_offset); - return; - } - - tlb_offset -= orig_addr_offset; - if (tlb_offset > alloc_size) { - dev_WARN_ONCE(dev, 1, - "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n", - alloc_size, size, tlb_offset); - return; - } + /* + * It's valid for tlb_offset to be negative. This can happen when the + * "offset" returned by swiotlb_align_offset() is non-zero, and the + * tlb_addr is pointing within the first "offset" bytes of the second + * or subsequent slots of the allocated swiotlb area. While it's not + * valid for tlb_addr to be pointing within the first "offset" bytes + * of the first slot, there's no way to check for such an error since + * this function can't distinguish the first slot from the second and + * subsequent slots. + */ + tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) - + swiotlb_align_offset(dev, 0, orig_addr); orig_addr += tlb_offset; alloc_size -= tlb_offset; From a1255ccab8ecee89905ddb12161139b0d878a7f2 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 29 Mar 2024 12:28:09 -0700 Subject: [PATCH 032/270] swiotlb: do not set total_used to 0 in swiotlb_create_debugfs_files() Sometimes the readout of /sys/kernel/debug/swiotlb/io_tlb_used and io_tlb_used_hiwater can be a huge number (e.g. 18446744073709551615), which is actually a negative number if we use "%ld" to print the number. When swiotlb_create_default_debugfs() is running from late_initcall, mem->total_used may already be non-zero, because the storage driver may have already started to perform I/O operations: if the storage driver is built-in, its probe() callback is called before late_initcall. swiotlb_create_debugfs_files() should not blindly set mem->total_used and mem->used_hiwater to 0; actually it doesn't have to initialize the fields at all, because the fields, as part of the global struct io_tlb_default_mem, have been implicitly initialized to zero. Also don't explicitly set mem->transient_nslabs to 0. Fixes: 8b0977ecc8b3 ("swiotlb: track and report io_tlb_used high water marks in debugfs") Fixes: 02e765697038 ("swiotlb: add debugfs to track swiotlb transient pool usage") Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Reviewed-by: ZhangPeng Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d57c8837c813..a5e0dfc44d24 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1676,9 +1676,6 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get, static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { - atomic_long_set(&mem->total_used, 0); - atomic_long_set(&mem->used_hiwater, 0); - mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); if (!mem->nslabs) return; @@ -1689,7 +1686,6 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem, &fops_io_tlb_hiwater); #ifdef CONFIG_SWIOTLB_DYNAMIC - atomic_long_set(&mem->transient_nslabs, 0); debugfs_create_file("io_tlb_transient_nslabs", 0400, mem->debugfs, mem, &fops_io_tlb_transient_used); #endif From 135f218255b28c5bbf71e9e32a49e5c734cabbe5 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 28 Mar 2024 12:19:54 -0300 Subject: [PATCH 033/270] ARM: dts: imx7s-warp: Pass OV2680 link-frequencies Since commit 63b0cd30b78e ("media: ov2680: Add bus-cfg / endpoint property verification") the ov2680 no longer probes on a imx7s-warp7: ov2680 1-0036: error -EINVAL: supported link freq 330000000 not found ov2680 1-0036: probe with driver ov2680 failed with error -22 Fix it by passing the required 'link-frequencies' property as recommended by: https://www.kernel.org/doc/html/v6.9-rc1/driver-api/media/camera-sensor.html#handling-clocks Cc: stable@vger.kernel.org Fixes: 63b0cd30b78e ("media: ov2680: Add bus-cfg / endpoint property verification") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx7s-warp.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts b/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts index ba7231b364bb..7bab113ca6da 100644 --- a/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts +++ b/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts @@ -210,6 +210,7 @@ remote-endpoint = <&mipi_from_sensor>; clock-lanes = <0>; data-lanes = <1>; + link-frequencies = /bits/ 64 <330000000>; }; }; }; From 1d86c2b3946e69d6b0b93568d312aae6247847c0 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:03 -0400 Subject: [PATCH 034/270] arm64: dts: imx8-ss-lsio: fix pwm lpcg indices lpcg's arg0 should use clock indices instead of index. pwm0_lpcg: clock-controller@5d400000 { ... // Col1 Col2 clocks = <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>, // 0 0 <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>, // 1 1 <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>, // 2 4 <&lsio_bus_clk>, // 3 5 <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>; // 4 6 clock-indices = , , , , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. pwm1 { .... clocks = <&pwm1_lpcg 4>, <&pwm1_lpcg 1>; ^^ ^^ should be: clocks = <&pwm1_lpcg IMX_LPCG_CLK_6>, <&pwm1_lpcg IMX_LPCG_CLK_1>; }; Arg0 is divided by 4 in lpcg driver, so index 0 and 1 will be get by pwm driver, which are same as IMX_LPCG_CLK_6 and IMX_LPCG_CLK_1. Even it can work, but code logic is wrong. Fixed it by use correct indices. Cc: stable@vger.kernel.org Fixes: 23fa99b205ea ("arm64: dts: freescale: imx8-ss-lsio: add support for lsio_pwm0-3") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi index 7e510b21bbac..764c1a08e3b1 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi @@ -25,8 +25,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d000000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm0_lpcg 4>, - <&pwm0_lpcg 1>; + clocks = <&pwm0_lpcg IMX_LPCG_CLK_6>, + <&pwm0_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; @@ -38,8 +38,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d010000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm1_lpcg 4>, - <&pwm1_lpcg 1>; + clocks = <&pwm1_lpcg IMX_LPCG_CLK_6>, + <&pwm1_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_1 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; @@ -51,8 +51,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d020000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm2_lpcg 4>, - <&pwm2_lpcg 1>; + clocks = <&pwm2_lpcg IMX_LPCG_CLK_6>, + <&pwm2_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_2 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; @@ -64,8 +64,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d030000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm3_lpcg 4>, - <&pwm3_lpcg 1>; + clocks = <&pwm3_lpcg IMX_LPCG_CLK_6>, + <&pwm3_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_3 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; From 808e7716edcdb39d3498b9f567ef6017858b49aa Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:04 -0400 Subject: [PATCH 035/270] arm64: dts: imx8-ss-conn: fix usb lpcg indices usb2_lpcg: clock-controller@5b270000 { ... Col1 Col2 clocks = <&conn_ahb_clk>, <&conn_ipg_clk>; // 0 6 clock-indices = , ; // 0 7 ... }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. usbotg1: usb@5b0d0000 { ... clocks = <&usb2_lpcg 0>; ^^ Should be: clocks = <&usb2_lpcg IMX_LPCG_CLK_6>; }; usbphy1: usbphy@5b100000 { clocks = <&usb2_lpcg 1>; ^^ SHould be: clocks = <&usb2_lpcg IMX_LPCG_CLK_7>; }; Arg0 is divided by 4 in lpcg driver. So lpcg will do dummy enable. Fix it by use correct clock indices. Cc: stable@vger.kernel.org Fixes: 8065fc937f0f ("arm64: dts: imx8dxl: add usb1 and usb2 support") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi index af2259e99796..4aaf5a0c1ed8 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi @@ -41,7 +41,7 @@ conn_subsys: bus@5b000000 { interrupts = ; fsl,usbphy = <&usbphy1>; fsl,usbmisc = <&usbmisc1 0>; - clocks = <&usb2_lpcg 0>; + clocks = <&usb2_lpcg IMX_LPCG_CLK_6>; ahb-burst-config = <0x0>; tx-burst-size-dword = <0x10>; rx-burst-size-dword = <0x10>; @@ -58,7 +58,7 @@ conn_subsys: bus@5b000000 { usbphy1: usbphy@5b100000 { compatible = "fsl,imx7ulp-usbphy"; reg = <0x5b100000 0x1000>; - clocks = <&usb2_lpcg 1>; + clocks = <&usb2_lpcg IMX_LPCG_CLK_7>; power-domains = <&pd IMX_SC_R_USB_0_PHY>; status = "disabled"; }; From f72b544a514c07d34a0d9d5380f5905b3731e647 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:05 -0400 Subject: [PATCH 036/270] arm64: dts: imx8-ss-dma: fix spi lpcg indices spi0_lpcg: clock-controller@5a400000 { ... Col0 Col1 clocks = <&clk IMX_SC_R_SPI_0 IMX_SC_PM_CLK_PER>,// 0 1 <&dma_ipg_clk>; // 1 4 clock-indices = , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. lpspi0: spi@5a000000 { ... clocks = <&spi0_lpcg 0>, <&spi0_lpcg 1>; ^ ^ Should be: clocks = <&spi0_lpcg IMX_LPCG_CLK_0>, <&spi0_lpcg IMX_LPCG_CLK_4>; }; Arg0 is divided by 4 in lpcg driver. <&spi0_lpcg 0> and <&spi0_lpcg 1> are IMX_SC_PM_CLK_PER. Although code can work, code logic is wrong. It should use IMX_LPCG_CLK_0 and IMX_LPCG_CLK_4 for lpcg arg0. Cc: stable@vger.kernel.org Fixes: c4098885e790 ("arm64: dts: imx8dxl: add lpspi support") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index cab3468b1875..169cf885c744 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -28,8 +28,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi0_lpcg 0>, - <&spi0_lpcg 1>; + clocks = <&spi0_lpcg IMX_LPCG_CLK_0>, + <&spi0_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; @@ -44,8 +44,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi1_lpcg 0>, - <&spi1_lpcg 1>; + clocks = <&spi1_lpcg IMX_LPCG_CLK_0>, + <&spi1_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_1 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; @@ -60,8 +60,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi2_lpcg 0>, - <&spi2_lpcg 1>; + clocks = <&spi2_lpcg IMX_LPCG_CLK_0>, + <&spi2_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_2 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; @@ -76,8 +76,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi3_lpcg 0>, - <&spi3_lpcg 1>; + clocks = <&spi3_lpcg IMX_LPCG_CLK_0>, + <&spi3_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_3 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; From 9055d87bce7276234173fa90e9702af31b3f5353 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:06 -0400 Subject: [PATCH 037/270] arm64: dts: imx8-ss-dma: fix pwm lpcg indices adma_pwm_lpcg: clock-controller@5a590000 { ... col1 col2 clocks = <&clk IMX_SC_R_LCD_0_PWM_0 IMX_SC_PM_CLK_PER>,// 0 0 <&dma_ipg_clk>; // 1 4 clock-indices = , ; ... }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. adma_pwm: pwm@5a190000 { ... clocks = <&adma_pwm_lpcg 1>, <&adma_pwm_lpcg 0>; ^^ ^^ Should be clocks = <&adma_pwm_lpcg IMX_LPCG_CLK_4>, <&adma_pwm_lpcg IMX_LPCG_CLK_0>; }; Arg0 will be divided by 4 in lcpg driver, so pwm will get IMX_SC_PM_CLK_PER by <&adma_pwm_lpcg 1>, <&adma_pwm_lpcg 0>. Although function can work, code logic is wrong. Fix it by use correct indices. Cc: stable@vger.kernel.org Fixes: f1d6a6b991ef ("arm64: dts: imx8qxp: add adma_pwm in adma") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index 169cf885c744..e3e911e57790 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -145,8 +145,8 @@ dma_subsys: bus@5a000000 { compatible = "fsl,imx8qxp-pwm", "fsl,imx27-pwm"; reg = <0x5a190000 0x1000>; interrupts = ; - clocks = <&adma_pwm_lpcg 1>, - <&adma_pwm_lpcg 0>; + clocks = <&adma_pwm_lpcg IMX_LPCG_CLK_4>, + <&adma_pwm_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_LCD_0_PWM_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; From 81975080f14167610976e968e8016e92d836266f Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:07 -0400 Subject: [PATCH 038/270] arm64: dts: imx8-ss-dma: fix adc lpcg indices adc0_lpcg: clock-controller@5ac80000 { ... Col1 Col2 clocks = <&clk IMX_SC_R_ADC_0 IMX_SC_PM_CLK_PER>, // 0 0 <&dma_ipg_clk>; // 1 4 clock-indices = , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. adc0: adc@5a880000 { clocks = <&adc0_lpcg 0>, <&adc0_lpcg 1>; ^^ ^^ clocks = <&adc0_lpcg IMX_LPCG_CLK_0>, <&adc0_lpcg IMX_LPCG_CLK_4>; Arg0 is divided by 4 in lpcg driver. So adc get IMX_SC_PM_CLK_PER by <&adc0_lpcg 0>, <&adc0_lpcg 1>. Although function can work, code logic is wrong. Fix it by using correct indices. Cc: stable@vger.kernel.org Fixes: 1db044b25d2e ("arm64: dts: imx8dxl: add adc0 support") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index e3e911e57790..23d59b34bc4c 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -355,8 +355,8 @@ dma_subsys: bus@5a000000 { reg = <0x5a880000 0x10000>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&adc0_lpcg 0>, - <&adc0_lpcg 1>; + clocks = <&adc0_lpcg IMX_LPCG_CLK_0>, + <&adc0_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_ADC_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; @@ -370,8 +370,8 @@ dma_subsys: bus@5a000000 { reg = <0x5a890000 0x10000>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&adc1_lpcg 0>, - <&adc1_lpcg 1>; + clocks = <&adc1_lpcg IMX_LPCG_CLK_0>, + <&adc1_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_ADC_1 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; From 0893392334b5dffdf616a53679c6a2942c46391b Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:08 -0400 Subject: [PATCH 039/270] arm64: dts: imx8-ss-dma: fix can lpcg indices can0_lpcg: clock-controller@5acd0000 { ... Col1 Col2 clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>, // 0 0 <&dma_ipg_clk>, // 1 4 <&dma_ipg_clk>; // 2 5 clock-indices = , , ; } Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. flexcan1: can@5a8d0000 { clocks = <&can0_lpcg 1>, <&can0_lpcg 0>; ^^ ^^ Should be: clocks = <&can0_lpcg IMX_LPCG_CLK_4>, <&can0_lpcg IMX_LPCG_CLK_0>; }; Arg0 is divided by 4 in lpcg driver. flexcan driver get IMX_SC_PM_CLK_PER by <&can0_lpcg 1> and <&can0_lpcg 0>. Although function can work, code logic is wrong. Fix it by using correct clock indices. Cc: stable@vger.kernel.org Fixes: 5e7d5b023e03 ("arm64: dts: imx8qxp: add flexcan in adma") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index 23d59b34bc4c..f7a91d43a0ff 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -384,8 +384,8 @@ dma_subsys: bus@5a000000 { reg = <0x5a8d0000 0x10000>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&can0_lpcg 1>, - <&can0_lpcg 0>; + clocks = <&can0_lpcg IMX_LPCG_CLK_4>, + <&can0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <40000000>; @@ -405,8 +405,8 @@ dma_subsys: bus@5a000000 { * CAN1 shares CAN0's clock and to enable CAN0's clock it * has to be powered on. */ - clocks = <&can0_lpcg 1>, - <&can0_lpcg 0>; + clocks = <&can0_lpcg IMX_LPCG_CLK_4>, + <&can0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <40000000>; @@ -426,8 +426,8 @@ dma_subsys: bus@5a000000 { * CAN2 shares CAN0's clock and to enable CAN0's clock it * has to be powered on. */ - clocks = <&can0_lpcg 1>, - <&can0_lpcg 0>; + clocks = <&can0_lpcg IMX_LPCG_CLK_4>, + <&can0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <40000000>; From 00b436182138310bb8d362b912b12a9df8f72ca3 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:09 -0400 Subject: [PATCH 040/270] arm64: dts: imx8qm-ss-dma: fix can lpcg indices can1_lpcg: clock-controller@5ace0000 { ... Col1 Col2 clocks = <&clk IMX_SC_R_CAN_1 IMX_SC_PM_CLK_PER>,// 0 0 <&dma_ipg_clk>, // 1 4 <&dma_ipg_clk>; // 2 5 clock-indices = , , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver &flexcan2 { clocks = <&can1_lpcg 1>, <&can1_lpcg 0>; ^^ ^^ Should be: clocks = <&can1_lpcg IMX_LPCG_CLK_4>, <&can1_lpcg IMX_LPCG_CLK_0>; }; Arg0 is divided by 4 in lpcg driver. So flexcan get IMX_SC_PM_CLK_PER by <&can1_lpcg 1> and <&can1_lpcg 0>. Although function work, code logic is wrong. Fix it by using correct clock indices. Cc: stable@vger.kernel.org Fixes: be85831de020 ("arm64: dts: imx8qm: add can node in devicetree") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi index 11626fae5f97..aa9f28c4431d 100644 --- a/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi @@ -153,15 +153,15 @@ }; &flexcan2 { - clocks = <&can1_lpcg 1>, - <&can1_lpcg 0>; + clocks = <&can1_lpcg IMX_LPCG_CLK_4>, + <&can1_lpcg IMX_LPCG_CLK_0>; assigned-clocks = <&clk IMX_SC_R_CAN_1 IMX_SC_PM_CLK_PER>; fsl,clk-source = /bits/ 8 <1>; }; &flexcan3 { - clocks = <&can2_lpcg 1>, - <&can2_lpcg 0>; + clocks = <&can2_lpcg IMX_LPCG_CLK_4>, + <&can2_lpcg IMX_LPCG_CLK_0>; assigned-clocks = <&clk IMX_SC_R_CAN_2 IMX_SC_PM_CLK_PER>; fsl,clk-source = /bits/ 8 <1>; }; From f7c52345ccc96343c0a05bdea3121c8ac7b67d5f Mon Sep 17 00:00:00 2001 From: Kwangjin Ko Date: Tue, 2 Apr 2024 17:14:03 +0900 Subject: [PATCH 041/270] cxl/core: Fix initialization of mbox_cmd.size_out in get event Since mbox_cmd.size_out is overwritten with the actual output size in the function below, it needs to be initialized every time. cxl_internal_send_cmd -> __cxl_pci_mbox_send_cmd Problem scenario: 1) The size_out variable is initially set to the size of the mailbox. 2) Read an event. - size_out is set to 160 bytes(header 32B + one event 128B). - Two event are created while reading. 3) Read the new *two* events. - size_out is still set to 160 bytes. - Although the value of out_len is 288 bytes, only 160 bytes are copied from the mailbox register to the local variable. - record_count is set to 2. - Accessing records[1] will result in reading incorrect data. Fixes: 6ebe28f9ec72 ("cxl/mem: Read, trace, and clear events on driver load") Tested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Signed-off-by: Kwangjin Ko Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 50146161887d..f0f54aeccc87 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -958,13 +958,14 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, .payload_in = &log_type, .size_in = sizeof(log_type), .payload_out = payload, - .size_out = mds->payload_size, .min_out = struct_size(payload, records, 0), }; do { int rc, i; + mbox_cmd.size_out = mds->payload_size; + rc = cxl_internal_send_cmd(mds, &mbox_cmd); if (rc) { dev_err_ratelimited(dev, From 1fc9af813b25e146d3607669247d0f970f5a87c3 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 5 Jan 2024 21:46:11 +0300 Subject: [PATCH 042/270] drm/panfrost: Fix the error path in panfrost_mmu_map_fault_addr() Subject: [PATCH] drm/panfrost: Fix the error path in panfrost_mmu_map_fault_addr() If some the pages or sgt allocation failed, we shouldn't release the pages ref we got earlier, otherwise we will end up with unbalanced get/put_pages() calls. We should instead leave everything in place and let the BO release function deal with extra cleanup when the object is destroyed, or let the fault handler try again next time it's called. Fixes: 187d2929206e ("drm/panfrost: Add support for GPU heap allocations") Cc: Reviewed-by: Steven Price Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Boris Brezillon Co-developed-by: Dmitry Osipenko Signed-off-by: Dmitry Osipenko Link: https://patchwork.freedesktop.org/patch/msgid/20240105184624.508603-18-dmitry.osipenko@collabora.com --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index f38385fe76bb..b91019cd5acb 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -502,11 +502,18 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, mapping_set_unevictable(mapping); for (i = page_offset; i < page_offset + NUM_FAULT_PAGES; i++) { + /* Can happen if the last fault only partially filled this + * section of the pages array before failing. In that case + * we skip already filled pages. + */ + if (pages[i]) + continue; + pages[i] = shmem_read_mapping_page(mapping, i); if (IS_ERR(pages[i])) { ret = PTR_ERR(pages[i]); pages[i] = NULL; - goto err_pages; + goto err_unlock; } } @@ -514,7 +521,7 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, ret = sg_alloc_table_from_pages(sgt, pages + page_offset, NUM_FAULT_PAGES, 0, SZ_2M, GFP_KERNEL); if (ret) - goto err_pages; + goto err_unlock; ret = dma_map_sgtable(pfdev->dev, sgt, DMA_BIDIRECTIONAL, 0); if (ret) @@ -537,8 +544,6 @@ out: err_map: sg_free_table(sgt); -err_pages: - drm_gem_shmem_put_pages(&bo->base); err_unlock: dma_resv_unlock(obj->resv); err_bo: From 1a4ea83a6e67f1415a1f17c1af5e9c814c882bb5 Mon Sep 17 00:00:00 2001 From: Yuanhe Shu Date: Mon, 26 Feb 2024 11:18:16 +0800 Subject: [PATCH 043/270] selftests/ftrace: Limit length in subsystem-enable tests While sched* events being traced and sched* events continuously happen, "[xx] event tracing - enable/disable with subsystem level files" would not stop as on some slower systems it seems to take forever. Select the first 100 lines of output would be enough to judge whether there are more than 3 types of sched events. Fixes: 815b18ea66d6 ("ftracetest: Add basic event tracing test cases") Cc: stable@vger.kernel.org Signed-off-by: Yuanhe Shu Acked-by: Masami Hiramatsu (Google) Acked-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- .../selftests/ftrace/test.d/event/subsystem-enable.tc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc index b1ede6249866..b7c8f29c09a9 100644 --- a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc +++ b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc @@ -18,7 +18,7 @@ echo 'sched:*' > set_event yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -29,7 +29,7 @@ echo 1 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -40,7 +40,7 @@ echo 0 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -ne 0 ]; then fail "any of scheduler events should not be recorded" fi From 72d7cb5c190befbb095bae7737e71560ec0fcaa6 Mon Sep 17 00:00:00 2001 From: Shengyu Li Date: Wed, 27 Mar 2024 05:13:15 +0800 Subject: [PATCH 044/270] selftests/harness: Prevent infinite loop due to Assert in FIXTURE_TEARDOWN This patch addresses an issue in the selftests/harness where an assertion within FIXTURE_TEARDOWN could trigger an infinite loop. The problem arises because the teardown procedure is meant to execute once, but the presence of failing assertions (ASSERT_EQ(0, 1)) leads to repeated attempts to execute teardown due to the long jump mechanism used by the harness for handling assertions. To resolve this, the patch ensures that the teardown process runs only once, regardless of assertion outcomes, preventing the infinite loop and allowing tests to fail. A simple test demo(test.c): #include "kselftest_harness.h" FIXTURE(f) { int fd; }; FIXTURE_SETUP(f) { self->fd = 0; } FIXTURE_TEARDOWN(f) { TH_LOG("TEARDOWN"); ASSERT_EQ(0, 1); self->fd = -1; } TEST_F(f, open_close) { ASSERT_NE(self->fd, 1); } TEST_HARNESS_MAIN will always output the following output due to a dead loop until timeout: # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) ... But here's what we should and expect to get: TAP version 13 1..1 # Starting 1 tests from 2 test cases. # RUN f.open_close ... # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) # open_close: Test terminated by assertion # FAIL f.open_close not ok 1 f.open_close # FAILED: 0 / 1 tests passed. # Totals: pass:0 fail:1 xfail:0 xpass:0 skip:0 error:0 also this is related to the issue mentioned in this patch https://patchwork.kernel.org/project/linux-kselftest/patch/e2ba3f8c-80e6-477d-9cea-1c9af820e0ed@alu.unizg.hr/ Signed-off-by: Shengyu Li Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_harness.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 4fd735e48ee7..230d62884885 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -383,6 +383,7 @@ FIXTURE_DATA(fixture_name) self; \ pid_t child = 1; \ int status = 0; \ + bool jmp = false; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ if (setjmp(_metadata->env) == 0) { \ /* Use the same _metadata. */ \ @@ -399,8 +400,10 @@ _metadata->exit_code = KSFT_FAIL; \ } \ } \ + else \ + jmp = true; \ if (child == 0) { \ - if (_metadata->setup_completed && !_metadata->teardown_parent) \ + if (_metadata->setup_completed && !_metadata->teardown_parent && !jmp) \ fixture_name##_teardown(_metadata, &self, variant->data); \ _exit(0); \ } \ From bc004f5038220b1891ef4107134ccae44be55109 Mon Sep 17 00:00:00 2001 From: Jammy Huang Date: Wed, 3 Apr 2024 17:02:46 +0800 Subject: [PATCH 045/270] drm/ast: Fix soft lockup There is a while-loop in ast_dp_set_on_off() that could lead to infinite-loop. This is because the register, VGACRI-Dx, checked in this API is a scratch register actually controlled by a MCU, named DPMCU, in BMC. These scratch registers are protected by scu-lock. If suc-lock is not off, DPMCU can not update these registers and then host will have soft lockup due to never updated status. DPMCU is used to control DP and relative registers to handshake with host's VGA driver. Even the most time-consuming task, DP's link training, is less than 100ms. 200ms should be enough. Signed-off-by: Jammy Huang Fixes: 594e9c04b586 ("drm/ast: Create the driver for ASPEED proprietory Display-Port") Reviewed-by: Jocelyn Falempe Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Cc: KuoHsiang Chou Cc: Thomas Zimmermann Cc: Dave Airlie Cc: Jocelyn Falempe Cc: dri-devel@lists.freedesktop.org Cc: # v5.19+ Link: https://patchwork.freedesktop.org/patch/msgid/20240403090246.1495487-1-jammy_huang@aspeedtech.com --- drivers/gpu/drm/ast/ast_dp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/ast/ast_dp.c b/drivers/gpu/drm/ast/ast_dp.c index ebb6d8ebd44e..1e9259416980 100644 --- a/drivers/gpu/drm/ast/ast_dp.c +++ b/drivers/gpu/drm/ast/ast_dp.c @@ -180,6 +180,7 @@ void ast_dp_set_on_off(struct drm_device *dev, bool on) { struct ast_device *ast = to_ast_device(dev); u8 video_on_off = on; + u32 i = 0; // Video On/Off ast_set_index_reg_mask(ast, AST_IO_VGACRI, 0xE3, (u8) ~AST_DP_VIDEO_ENABLE, on); @@ -192,6 +193,8 @@ void ast_dp_set_on_off(struct drm_device *dev, bool on) ASTDP_MIRROR_VIDEO_ENABLE) != video_on_off) { // wait 1 ms mdelay(1); + if (++i > 200) + break; } } } From 07ed11afb68d94eadd4ffc082b97c2331307c5ea Mon Sep 17 00:00:00 2001 From: Alex Constantino Date: Thu, 4 Apr 2024 19:14:48 +0100 Subject: [PATCH 046/270] Revert "drm/qxl: simplify qxl_fence_wait" This reverts commit 5a838e5d5825c85556011478abde708251cc0776. Changes from commit 5a838e5d5825 ("drm/qxl: simplify qxl_fence_wait") would result in a '[TTM] Buffer eviction failed' exception whenever it reached a timeout. Due to a dependency to DMA_FENCE_WARN this also restores some code deleted by commit d72277b6c37d ("dma-buf: nuke DMA_FENCE_TRACE macros v2"). Fixes: 5a838e5d5825 ("drm/qxl: simplify qxl_fence_wait") Link: https://lore.kernel.org/regressions/ZTgydqRlK6WX_b29@eldamar.lan/ Reported-by: Timo Lindfors Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1054514 Signed-off-by: Alex Constantino Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20240404181448.1643-2-dreaming.about.electric.sheep@gmail.com --- drivers/gpu/drm/qxl/qxl_release.c | 50 +++++++++++++++++++++++++++---- include/linux/dma-fence.h | 7 +++++ 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index 368d26da0d6a..9febc8b73f09 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -58,16 +58,56 @@ static long qxl_fence_wait(struct dma_fence *fence, bool intr, signed long timeout) { struct qxl_device *qdev; + struct qxl_release *release; + int count = 0, sc = 0; + bool have_drawable_releases; unsigned long cur, end = jiffies + timeout; qdev = container_of(fence->lock, struct qxl_device, release_lock); + release = container_of(fence, struct qxl_release, base); + have_drawable_releases = release->type == QXL_RELEASE_DRAWABLE; - if (!wait_event_timeout(qdev->release_event, - (dma_fence_is_signaled(fence) || - (qxl_io_notify_oom(qdev), 0)), - timeout)) - return 0; +retry: + sc++; + if (dma_fence_is_signaled(fence)) + goto signaled; + + qxl_io_notify_oom(qdev); + + for (count = 0; count < 11; count++) { + if (!qxl_queue_garbage_collect(qdev, true)) + break; + + if (dma_fence_is_signaled(fence)) + goto signaled; + } + + if (dma_fence_is_signaled(fence)) + goto signaled; + + if (have_drawable_releases || sc < 4) { + if (sc > 2) + /* back off */ + usleep_range(500, 1000); + + if (time_after(jiffies, end)) + return 0; + + if (have_drawable_releases && sc > 300) { + DMA_FENCE_WARN(fence, + "failed to wait on release %llu after spincount %d\n", + fence->context & ~0xf0000000, sc); + goto signaled; + } + goto retry; + } + /* + * yeah, original sync_obj_wait gave up after 3 spins when + * have_drawable_releases is not set. + */ + +signaled: cur = jiffies; if (time_after(cur, end)) return 0; diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index e06bad467f55..c3f9bb6602ba 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -682,4 +682,11 @@ static inline bool dma_fence_is_container(struct dma_fence *fence) return dma_fence_is_array(fence) || dma_fence_is_chain(fence); } +#define DMA_FENCE_WARN(f, fmt, args...) \ + do { \ + struct dma_fence *__ff = (f); \ + pr_warn("f %llu#%llu: " fmt, __ff->context, __ff->seqno,\ + ##args); \ + } while (0) + #endif /* __LINUX_DMA_FENCE_H */ From 24cfd86433c920188ac3f02df8aba6bc4c792f4b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Apr 2024 18:30:14 +0900 Subject: [PATCH 047/270] ata: ahci: Add mask_port_map module parameter Commits 0077a504e1a4 ("ahci: asm1166: correct count of reported ports") and 9815e3961754 ("ahci: asm1064: correct count of reported ports") attempted to limit the ports of the ASM1166 and ASM1064 AHCI controllers to avoid long boot times caused by the fact that these adapters report a port map larger than the number of physical ports. The excess ports are "virtual" to hide port multiplier devices and probing these ports takes time. However, these commits caused a regression for users that do use PMP devices, as the ATA devices connected to the PMP cannot be scanned. These commits have thus been reverted by commit 6cd8adc3e18 ("ahci: asm1064: asm1166: don't limit reported ports") to allow the discovery of devices connected through a port multiplier. But this revert re-introduced the long boot times for users that do not use a port multiplier setup. This patch adds the mask_port_map ahci module parameter to allow users to manually specify port map masks for controllers. In the case of the ASMedia 1166 and 1064 controllers, users that do not have port multiplier devices can mask the excess virtual ports exposed by the controller to speedup port scanning, thus reducing boot time. The mask_port_map parameter accepts 2 different formats: - mask_port_map= This applies the same mask to all AHCI controllers present in the system. This format is convenient for small systems that have only a single AHCI controller. - mask_port_map==,=mask,... This applies the specified masks only to the PCI device listed. The field is a regular PCI device ID (domain:bus:dev.func). This ID can be seen following "ahci" in the kernel messages. E.g. for "ahci 0000:01:00.0: 2/2 ports implemented (port mask 0x3)", the field is "0000:01:00.0". When used, the function ahci_save_initial_config() indicates that a port map mask was applied with the message "masking port_map ...". E.g.: without a mask: modprobe ahci dmesg | grep ahci ... ahci 0000:00:17.0: AHCI vers 0001.0301, 32 command slots, 6 Gbps, SATA mode ahci 0000:00:17.0: (0000:00:17.0) 8/8 ports implemented (port mask 0xff) With a mask: modprobe ahci mask_port_map=0000:00:17.0=0x1 dmesg | grep ahci ... ahci 0000:00:17.0: masking port_map 0xff -> 0x1 ahci 0000:00:17.0: AHCI vers 0001.0301, 32 command slots, 6 Gbps, SATA mode ahci 0000:00:17.0: (0000:00:17.0) 1/8 ports implemented (port mask 0x1) Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel --- drivers/ata/ahci.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 562302e2e57c..6548f10e61d9 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -666,6 +666,87 @@ static int mobile_lpm_policy = -1; module_param(mobile_lpm_policy, int, 0644); MODULE_PARM_DESC(mobile_lpm_policy, "Default LPM policy for mobile chipsets"); +static char *ahci_mask_port_map; +module_param_named(mask_port_map, ahci_mask_port_map, charp, 0444); +MODULE_PARM_DESC(mask_port_map, + "32-bits port map masks to ignore controllers ports. " + "Valid values are: " + "\"\" to apply the same mask to all AHCI controller " + "devices, and \"=,=,...\" to " + "specify different masks for the controllers specified, " + "where is the PCI ID of an AHCI controller in the " + "form \"domain:bus:dev.func\""); + +static void ahci_apply_port_map_mask(struct device *dev, + struct ahci_host_priv *hpriv, char *mask_s) +{ + unsigned int mask; + + if (kstrtouint(mask_s, 0, &mask)) { + dev_err(dev, "Invalid port map mask\n"); + return; + } + + hpriv->mask_port_map = mask; +} + +static void ahci_get_port_map_mask(struct device *dev, + struct ahci_host_priv *hpriv) +{ + char *param, *end, *str, *mask_s; + char *name; + + if (!strlen(ahci_mask_port_map)) + return; + + str = kstrdup(ahci_mask_port_map, GFP_KERNEL); + if (!str) + return; + + /* Handle single mask case */ + if (!strchr(str, '=')) { + ahci_apply_port_map_mask(dev, hpriv, str); + goto free; + } + + /* + * Mask list case: parse the parameter to apply the mask only if + * the device name matches. + */ + param = str; + end = param + strlen(param); + while (param && param < end && *param) { + name = param; + param = strchr(name, '='); + if (!param) + break; + + *param = '\0'; + param++; + if (param >= end) + break; + + if (strcmp(dev_name(dev), name) != 0) { + param = strchr(param, ','); + if (param) + param++; + continue; + } + + mask_s = param; + param = strchr(mask_s, ','); + if (param) { + *param = '\0'; + param++; + } + + ahci_apply_port_map_mask(dev, hpriv, mask_s); + } + +free: + kfree(str); +} + static void ahci_pci_save_initial_config(struct pci_dev *pdev, struct ahci_host_priv *hpriv) { @@ -688,6 +769,10 @@ static void ahci_pci_save_initial_config(struct pci_dev *pdev, "Disabling your PATA port. Use the boot option 'ahci.marvell_enable=0' to avoid this.\n"); } + /* Handle port map masks passed as module parameter. */ + if (ahci_mask_port_map) + ahci_get_port_map_mask(&pdev->dev, hpriv); + ahci_save_initial_config(&pdev->dev, hpriv); } From 648dae58a830ecceea3b1bebf68432435980f137 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:12 -0700 Subject: [PATCH 048/270] cxl: Remove checking of iter in cxl_endpoint_get_perf_coordinates() The while() loop in cxl_endpoint_get_perf_coordinates() checks to see if 'iter' is valid as part of the condition breaking out of the loop. is_cxl_root() will stop the loop before the next iteration could go NULL. Remove the iter check. The presence of the iter or removing the iter does not impact the behavior of the code. This is a code clean up and not a bug fix. Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-2-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 2b0cab556072..6cbde50a742b 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2197,7 +2197,7 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, * port each iteration. If the parent is cxl root then there is * nothing to gather. */ - while (iter && !is_cxl_root(to_cxl_port(iter->dev.parent))) { + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) { cxl_coordinates_combine(&c, &c, &dport->sw_coord); c.write_latency += dport->link_latency; c.read_latency += dport->link_latency; From 838ae9f45c4e43b4633d8b0ad1fbedff9ecf177d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 30 Mar 2024 07:12:03 -0700 Subject: [PATCH 049/270] nouveau/gsp: Avoid addressing beyond end of rpc->entries Using the end of rpc->entries[] for addressing runs into both compile-time and run-time detection of accessing beyond the end of the array. Use the base pointer instead, since was allocated with the additional bytes for storing the strings. Avoids the following warning in future GCC releases with support for __counted_by: In function 'fortify_memcpy_chk', inlined from 'r535_gsp_rpc_set_registry' at ../drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c:1123:3: ../include/linux/fortify-string.h:553:25: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning] 553 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ for this code: strings = (char *)&rpc->entries[NV_GSP_REG_NUM_ENTRIES]; ... memcpy(strings, r535_registry_entries[i].name, name_len); Signed-off-by: Kees Cook Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240330141159.work.063-kees@kernel.org --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index 9994cbd6f1c4..9858c1438aa7 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -1112,7 +1112,7 @@ r535_gsp_rpc_set_registry(struct nvkm_gsp *gsp) rpc->numEntries = NV_GSP_REG_NUM_ENTRIES; str_offset = offsetof(typeof(*rpc), entries[NV_GSP_REG_NUM_ENTRIES]); - strings = (char *)&rpc->entries[NV_GSP_REG_NUM_ENTRIES]; + strings = (char *)rpc + str_offset; for (i = 0; i < NV_GSP_REG_NUM_ENTRIES; i++) { int name_len = strlen(r535_registry_entries[i].name) + 1; From 185fdb4697cc9684a02f2fab0530ecdd0c2f15d4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Apr 2024 18:02:25 +0200 Subject: [PATCH 050/270] nouveau: fix function cast warning Calling a function through an incompatible pointer type causes breaks kcfi, so clang warns about the assignment: drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c:73:10: error: cast from 'void (*)(const void *)' to 'void (*)(void *)' converts to incompatible function type [-Werror,-Wcast-function-type-strict] 73 | .fini = (void(*)(void *))kfree, Avoid this with a trivial wrapper. Fixes: c39f472e9f14 ("drm/nouveau: remove symlinks, move core/ to nvkm/ (no code changes)") Signed-off-by: Arnd Bergmann Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240404160234.2923554-1-arnd@kernel.org --- drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c index 4bf486b57101..cb05f7f48a98 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c @@ -66,11 +66,16 @@ of_init(struct nvkm_bios *bios, const char *name) return ERR_PTR(-EINVAL); } +static void of_fini(void *p) +{ + kfree(p); +} + const struct nvbios_source nvbios_of = { .name = "OpenFirmware", .init = of_init, - .fini = (void(*)(void *))kfree, + .fini = of_fini, .read = of_read, .size = of_size, .rw = false, From 0c3b532ad3fbf82884a2e7e83e37c7dcdd4d1d99 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Apr 2024 19:25:21 +0300 Subject: [PATCH 051/270] gpio: wcove: Use -ENOTSUPP consistently The GPIO library expects the drivers to return -ENOTSUPP in some cases and not using analogue POSIX code. Make the driver to follow this. Reviewed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Andy Shevchenko --- drivers/gpio/gpio-wcove.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-wcove.c b/drivers/gpio/gpio-wcove.c index c18b6b47384f..94ca9d03c094 100644 --- a/drivers/gpio/gpio-wcove.c +++ b/drivers/gpio/gpio-wcove.c @@ -104,7 +104,7 @@ static inline int to_reg(int gpio, enum ctrl_register type) unsigned int reg = type == CTRL_IN ? GPIO_IN_CTRL_BASE : GPIO_OUT_CTRL_BASE; if (gpio >= WCOVE_GPIO_NUM) - return -EOPNOTSUPP; + return -ENOTSUPP; return reg + gpio; } From ace0ebe5c98d66889f19e0f30e2518d0c58d0e04 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Apr 2024 19:26:22 +0300 Subject: [PATCH 052/270] gpio: crystalcove: Use -ENOTSUPP consistently The GPIO library expects the drivers to return -ENOTSUPP in some cases and not using analogue POSIX code. Make the driver to follow this. Signed-off-by: Andy Shevchenko --- drivers/gpio/gpio-crystalcove.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-crystalcove.c b/drivers/gpio/gpio-crystalcove.c index 1ee62cd58582..25db014494a4 100644 --- a/drivers/gpio/gpio-crystalcove.c +++ b/drivers/gpio/gpio-crystalcove.c @@ -92,7 +92,7 @@ static inline int to_reg(int gpio, enum ctrl_register reg_type) case 0x5e: return GPIOPANELCTL; default: - return -EOPNOTSUPP; + return -ENOTSUPP; } } From d3bbc4dfcc8d436b1e27a0bb6a5893b090fc1cab Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 26 Mar 2024 22:23:24 +0100 Subject: [PATCH 053/270] drm/msm: fix the `CRASHDUMP_READ` target of `a6xx_get_shader_block()` Clang 14 in an (essentially) defconfig arm64 build for next-20240326 reports [1]: drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c:843:6: error: variable 'out' set but not used [-Werror,-Wunused-but-set-variable] The variable `out` in these functions is meant to compute the `target` of `CRASHDUMP_READ()`, but in this case only the initial value (`dumper->iova + A6XX_CD_DATA_OFFSET`) was being passed. Thus use `out` as it was intended by Connor [2]. There was an alternative patch at [3] that removed the variable altogether, but that would only use the initial value. Fixes: 64d6255650d4 ("drm/msm: More fully implement devcoredump for a7xx") Closes: https://lore.kernel.org/lkml/CANiq72mjc5t4n25SQvYSrOEhxxpXYPZ4pPzneSJHEnc3qApu2Q@mail.gmail.com/ [1] Link: https://lore.kernel.org/lkml/CACu1E7HhCKMJd6fixZSPiNAz6ekoZnkMTHTcLFVmbZ-9VoLxKg@mail.gmail.com/ [2] Link: https://lore.kernel.org/lkml/20240307093727.1978126-1-colin.i.king@gmail.com/ [3] Signed-off-by: Miguel Ojeda Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/584955/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c index 1f5245fc2cdc..a847a0f7a73c 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c @@ -852,7 +852,7 @@ static void a6xx_get_shader_block(struct msm_gpu *gpu, (block->type << 8) | i); in += CRASHDUMP_READ(in, REG_A6XX_HLSQ_DBG_AHB_READ_APERTURE, - block->size, dumper->iova + A6XX_CD_DATA_OFFSET); + block->size, out); out += block->size * sizeof(u32); } From 9dc23cba0927d09cb481da064c8413eb9df42e2b Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Thu, 28 Mar 2024 09:02:45 +0100 Subject: [PATCH 054/270] drm/msm/adreno: Set highest_bank_bit for A619 The default highest_bank_bit of 15 didn't seem to cause issues so far but downstream defines it to be 14. But similar to [0] leaving it on 14 (or 15 for that matter) causes some corruption issues with some resolutions with DisplayPort, like 1920x1200. So set it to 13 for now so that there's no screen corruption. [0] commit 6a0dbcd20ef2 ("drm/msm/a6xx: set highest_bank_bit to 13 for a610") Fixes: b7616b5c69e6 ("drm/msm/adreno: Add A619 support") Signed-off-by: Luca Weiss Patchwork: https://patchwork.freedesktop.org/patch/585215/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 0674aca0f8a3..cf0b1de1c071 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1377,6 +1377,10 @@ static void a6xx_calc_ubwc_config(struct adreno_gpu *gpu) if (adreno_is_a618(gpu)) gpu->ubwc_config.highest_bank_bit = 14; + if (adreno_is_a619(gpu)) + /* TODO: Should be 14 but causes corruption at e.g. 1920x1200 on DP */ + gpu->ubwc_config.highest_bank_bit = 13; + if (adreno_is_a619_holi(gpu)) gpu->ubwc_config.highest_bank_bit = 13; From 978e5c19dfefc271e5550efba92fcef0d3f62864 Mon Sep 17 00:00:00 2001 From: Alexey Izbyshev Date: Fri, 5 Apr 2024 15:55:51 +0300 Subject: [PATCH 055/270] io_uring: Fix io_cqring_wait() not restoring sigmask on get_timespec64() failure This bug was introduced in commit 950e79dd7313 ("io_uring: minor io_cqring_wait() optimization"), which was made in preparation for adc8682ec690 ("io_uring: Add support for napi_busy_poll"). The latter got reverted in cb3182167325 ("Revert "io_uring: Add support for napi_busy_poll""), so simply undo the former as well. Cc: stable@vger.kernel.org Fixes: 950e79dd7313 ("io_uring: minor io_cqring_wait() optimization") Signed-off-by: Alexey Izbyshev Link: https://lore.kernel.org/r/20240405125551.237142-1-izbyshev@ispras.ru Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4521c2b66b98..c170a2b8d2cf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2602,19 +2602,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (__io_cqring_events_user(ctx) >= min_events) return 0; - if (sig) { -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - sigsz); - else -#endif - ret = set_user_sigmask(sig, sigsz); - - if (ret) - return ret; - } - init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); @@ -2633,6 +2620,19 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, io_napi_adjust_timeout(ctx, &iowq, &ts); } + if (sig) { +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, + sigsz); + else +#endif + ret = set_user_sigmask(sig, sigsz); + + if (ret) + return ret; + } + io_napi_busy_loop(ctx, &iowq); trace_io_uring_cqring_wait(ctx, min_events); From beaa51b36012fad5a4d3c18b88a617aea7a9b96d Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 4 Apr 2024 12:32:53 -0400 Subject: [PATCH 056/270] blk-iocost: avoid out of bounds shift UBSAN catches undefined behavior in blk-iocost, where sometimes iocg->delay is shifted right by a number that is too large, resulting in undefined behavior on some architectures. [ 186.556576] ------------[ cut here ]------------ UBSAN: shift-out-of-bounds in block/blk-iocost.c:1366:23 shift exponent 64 is too large for 64-bit type 'u64' (aka 'unsigned long long') CPU: 16 PID: 0 Comm: swapper/16 Tainted: G S E N 6.9.0-0_fbk700_debug_rc2_kbuilder_0_gc85af715cac0 #1 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A23 12/08/2020 Call Trace: dump_stack_lvl+0x8f/0xe0 __ubsan_handle_shift_out_of_bounds+0x22c/0x280 iocg_kick_delay+0x30b/0x310 ioc_timer_fn+0x2fb/0x1f80 __run_timer_base+0x1b6/0x250 ... Avoid that undefined behavior by simply taking the "delay = 0" branch if the shift is too large. I am not sure what the symptoms of an undefined value delay will be, but I suspect it could be more than a little annoying to debug. Signed-off-by: Rik van Riel Cc: Tejun Heo Cc: Josef Bacik Cc: Jens Axboe Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240404123253.0f58010f@imladris.surriel.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 9a85bfbbc45a..baa20c85799d 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1347,7 +1347,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); - u64 tdelta, delay, new_delay; + u64 tdelta, delay, new_delay, shift; s64 vover, vover_pct; u32 hwa; @@ -1362,8 +1362,9 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; - if (iocg->delay) - delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); + shift = div64_u64(tdelta, USEC_PER_SEC); + if (iocg->delay && shift < BITS_PER_LONG) + delay = iocg->delay >> shift; else delay = 0; From 8b8ace080319a866f5dfe9da8e665ae51d971c54 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 7 Apr 2024 20:59:10 +0800 Subject: [PATCH 057/270] block: fix q->blkg_list corruption during disk rebind Multiple gendisk instances can allocated/added for single request queue in case of disk rebind. blkg may still stay in q->blkg_list when calling blkcg_init_disk() for rebind, then q->blkg_list becomes corrupted. Fix the list corruption issue by: - add blkg_init_queue() to initialize q->blkg_list & q->blkcg_mutex only - move calling blkg_init_queue() into blk_alloc_queue() The list corruption should be started since commit f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") which delays removing blkg from q->blkg_list into blkg_free_workfn(). Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") Fixes: 1059699f87eb ("block: move blkcg initialization/destroy into disk allocation/release handler") Cc: Yu Kuai Cc: Tejun Heo Signed-off-by: Ming Lei Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240407125910.4053377-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 ++++++--- block/blk-cgroup.h | 2 ++ block/blk-core.c | 2 ++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bdbb557feb5a..059467086b13 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1409,6 +1409,12 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) return 0; } +void blkg_init_queue(struct request_queue *q) +{ + INIT_LIST_HEAD(&q->blkg_list); + mutex_init(&q->blkcg_mutex); +} + int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -1416,9 +1422,6 @@ int blkcg_init_disk(struct gendisk *disk) bool preloaded; int ret; - INIT_LIST_HEAD(&q->blkg_list); - mutex_init(&q->blkcg_mutex); - new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 78b74106bf10..90b3959d88cf 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -189,6 +189,7 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; +void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); @@ -482,6 +483,7 @@ struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } diff --git a/block/blk-core.c b/block/blk-core.c index a16b5abdbbf5..3a6f5603fb44 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -442,6 +442,8 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); + blkg_init_queue(q); + /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. From b561ea56a26415bf44ce8ca6a8e625c7c390f1ea Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 7 Apr 2024 21:19:31 +0800 Subject: [PATCH 058/270] block: allow device to have both virt_boundary_mask and max segment size When one stacking device is over one device with virt_boundary_mask and another one with max segment size, the stacking device have both limits set. This way is allowed before d690cb8ae14b ("block: add an API to atomically update queue limits"). Relax the limit so that we won't break such kind of stacking setting. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218687 Reported-by: janpieter.sollie@edpnet.be Fixes: d690cb8ae14b ("block: add an API to atomically update queue limits") Link: https://lore.kernel.org/linux-block/ZfGl8HzUpiOxCLm3@fedora/ Cc: Christoph Hellwig Cc: Mike Snitzer Cc: dm-devel@lists.linux.dev Cc: Song Liu Cc: linux-raid@vger.kernel.org Signed-off-by: Ming Lei Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20240407131931.4055231-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index cdbaef159c4b..d2731843f2fc 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -182,17 +182,13 @@ static int blk_validate_limits(struct queue_limits *lim) return -EINVAL; /* - * Devices that require a virtual boundary do not support scatter/gather - * I/O natively, but instead require a descriptor list entry for each - * page (which might not be identical to the Linux PAGE_SIZE). Because - * of that they are not limited by our notion of "segment size". + * Stacking device may have both virtual boundary and max segment + * size limit, so allow this setting now, and long-term the two + * might need to move out of stacking limits since we have immutable + * bvec and lower layer bio splitting is supposed to handle the two + * correctly. */ - if (lim->virt_boundary_mask) { - if (WARN_ON_ONCE(lim->max_segment_size && - lim->max_segment_size != UINT_MAX)) - return -EINVAL; - lim->max_segment_size = UINT_MAX; - } else { + if (!lim->virt_boundary_mask) { /* * The maximum segment size has an odd historic 64k default that * drivers probably should override. Just like the I/O size we From 22e1992cf7b034db5325660e98c41ca5afa5f519 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 28 Mar 2024 10:21:47 +1000 Subject: [PATCH 059/270] vhost: Add smp_rmb() in vhost_vq_avail_empty() A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by Will. Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M \ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_vq_avail_empty(). When tx_can_batch() returns true, it means there's still pending tx buffers. Since it might read indices, so it still can bypass the smp_rmb() in vhost_get_vq_desc(). Note that it should be safe until vq->avail_idx is changed by commit 275bf960ac697 ("vhost: better detection of available buffers"). Fixes: 275bf960ac69 ("vhost: better detection of available buffers") Cc: # v4.11+ Reported-by: Yihuang Yu Suggested-by: Will Deacon Signed-off-by: Gavin Shan Acked-by: Jason Wang Message-Id: <20240328002149.1141302-2-gshan@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/vhost/vhost.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..29df65b2ebf2 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2799,9 +2799,19 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) r = vhost_get_avail_idx(vq, &avail_idx); if (unlikely(r)) return false; - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - return vq->avail_idx == vq->last_avail_idx; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Since we have updated avail_idx, the following + * call to vhost_get_vq_desc() will read available + * ring entries. Make sure that read happens after + * the avail_idx read. + */ + smp_rmb(); + return false; + } + + return true; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); From df9ace7647d4123209395bb9967e998d5758c645 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 28 Mar 2024 10:21:48 +1000 Subject: [PATCH 060/270] vhost: Add smp_rmb() in vhost_enable_notify() A smp_rmb() has been missed in vhost_enable_notify(), inspired by Will. Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M \ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_enable_notify(). When it returns true, it means there's still pending tx buffers. Since it might read indices, so it still can bypass the smp_rmb() in vhost_get_vq_desc(). Note that it should be safe until vq->avail_idx is changed by commit d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()"). Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()") Cc: # v5.18+ Reported-by: Yihuang Yu Suggested-by: Will Deacon Signed-off-by: Gavin Shan Acked-by: Jason Wang Message-Id: <20240328002149.1141302-3-gshan@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/vhost/vhost.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 29df65b2ebf2..32686c79c41d 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2848,9 +2848,19 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) &vq->avail->idx, r); return false; } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - return vq->avail_idx != vq->last_avail_idx; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Since we have updated avail_idx, the following + * call to vhost_get_vq_desc() will read available + * ring entries. Make sure that read happens after + * the avail_idx read. + */ + smp_rmb(); + return true; + } + + return false; } EXPORT_SYMBOL_GPL(vhost_enable_notify); From ffe6176b7f53ca0c99355f13e14a33a40cf49406 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 31 Mar 2024 10:43:48 +0200 Subject: [PATCH 061/270] virtio: store owner from modules with register_virtio_driver() Modules registering driver with register_virtio_driver() might forget to set .owner field. i2c-virtio.c for example has it missing. The field is used by some other kernel parts for reference counting (try_module_get()), so it is expected that drivers will set it. Solve the problem by moving this task away from the drivers to the core virtio code, just like we did for platform_driver in commit 9447057eaff8 ("platform_device: use a macro instead of platform_driver_register"). Fixes: 3cfc88380413 ("i2c: virtio: add a virtio i2c frontend driver") Cc: "Jie Deng" Signed-off-by: Krzysztof Kozlowski Message-Id: <20240331-module-owner-virtio-v2-1-98f04bfaf46a@linaro.org> Signed-off-by: Michael S. Tsirkin --- Documentation/driver-api/virtio/writing_virtio_drivers.rst | 1 - drivers/virtio/virtio.c | 6 ++++-- include/linux/virtio.h | 7 +++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/driver-api/virtio/writing_virtio_drivers.rst b/Documentation/driver-api/virtio/writing_virtio_drivers.rst index e14c58796d25..e5de6f5d061a 100644 --- a/Documentation/driver-api/virtio/writing_virtio_drivers.rst +++ b/Documentation/driver-api/virtio/writing_virtio_drivers.rst @@ -97,7 +97,6 @@ like this:: static struct virtio_driver virtio_dummy_driver = { .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, .id_table = id_table, .probe = virtio_dummy_probe, .remove = virtio_dummy_remove, diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index f173587893cb..9510c551dce8 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -362,14 +362,16 @@ static const struct bus_type virtio_bus = { .remove = virtio_dev_remove, }; -int register_virtio_driver(struct virtio_driver *driver) +int __register_virtio_driver(struct virtio_driver *driver, struct module *owner) { /* Catch this early. */ BUG_ON(driver->feature_table_size && !driver->feature_table); driver->driver.bus = &virtio_bus; + driver->driver.owner = owner; + return driver_register(&driver->driver); } -EXPORT_SYMBOL_GPL(register_virtio_driver); +EXPORT_SYMBOL_GPL(__register_virtio_driver); void unregister_virtio_driver(struct virtio_driver *driver) { diff --git a/include/linux/virtio.h b/include/linux/virtio.h index b0201747a263..26c4325aa373 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -170,7 +170,7 @@ size_t virtio_max_dma_size(const struct virtio_device *vdev); /** * struct virtio_driver - operations for a virtio I/O driver - * @driver: underlying device driver (populate name and owner). + * @driver: underlying device driver (populate name). * @id_table: the ids serviced by this driver. * @feature_table: an array of feature numbers supported by this driver. * @feature_table_size: number of entries in the feature table array. @@ -208,7 +208,10 @@ static inline struct virtio_driver *drv_to_virtio(struct device_driver *drv) return container_of(drv, struct virtio_driver, driver); } -int register_virtio_driver(struct virtio_driver *drv); +/* use a macro to avoid include chaining to get THIS_MODULE */ +#define register_virtio_driver(drv) \ + __register_virtio_driver(drv, THIS_MODULE) +int __register_virtio_driver(struct virtio_driver *drv, struct module *owner); void unregister_virtio_driver(struct virtio_driver *drv); /* module_virtio_driver() - Helper macro for drivers that don't do From 2855c2a7820bc8198ae937a9a67dbdc3990e9d2c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 2 Apr 2024 17:21:43 -0400 Subject: [PATCH 062/270] vhost-vdpa: change ioctl # for VDPA_GET_VRING_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VDPA_GET_VRING_SIZE by mistake uses the already occupied ioctl # 0x80 and we never noticed - it happens to work because the direction and size are different, but confuses tools such as perf which like to look at just the number, and breaks the extra robustness of the ioctl numbering macros. To fix, sort the entries and renumber the ioctl - not too late since it wasn't in any released kernels yet. Cc: Arnaldo Carvalho de Melo Reported-by: Namhyung Kim Fixes: 1496c47065f9 ("vhost-vdpa: uapi to support reporting per vq size") Cc: "Zhu Lingshan" Signed-off-by: Michael S. Tsirkin Message-Id: <41c1c5489688abe5bfef9f7cf15584e3fb872ac5.1712092759.git.mst@redhat.com> Reviewed-by: Eugenio Pérez Reviewed-by: Zhu Lingshan Reviewed-by: Stefano Garzarella Acked-by: Jason Wang --- include/uapi/linux/vhost.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index bea697390613..b95dd84eef2d 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -179,12 +179,6 @@ /* Get the config size */ #define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) -/* Get the count of all virtqueues */ -#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) - -/* Get the number of virtqueue groups. */ -#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) - /* Get the number of address spaces. */ #define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) @@ -228,10 +222,17 @@ #define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ struct vhost_vring_state) + +/* Get the count of all virtqueues */ +#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) + +/* Get the number of virtqueue groups. */ +#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) + /* Get the queue size of a specific virtqueue. * userspace set the vring index in vhost_vring_state.index * kernel set the queue size in vhost_vring_state.num */ -#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x80, \ +#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) #endif From 76f408535aab39c33e0a1dcada9fba5631c65595 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Mon, 11 Mar 2024 16:21:09 +0800 Subject: [PATCH 063/270] vhost: correct misleading printing information Guest moved avail idx not used idx when we need to print log if '(vq->avail_idx - last_avail_idx) > vq->num', so fix it. Signed-off-by: Xianting Tian Message-Id: <20240311082109.46773-1-xianting.tian@linux.alibaba.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 32686c79c41d..8995730ce0bf 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2515,7 +2515,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, vq->avail_idx = vhost16_to_cpu(vq, avail_idx); if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved used index from %u to %u", + vq_err(vq, "Guest moved avail index from %u to %u", last_avail_idx, vq->avail_idx); return -EFAULT; } From f0cf7ffcd02953c72fed5995378805883d16203e Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Tue, 2 Apr 2024 12:49:22 +0200 Subject: [PATCH 064/270] accel/ivpu: Check return code of ipc->lock init Return value of drmm_mutex_init(ipc->lock) was unchecked. Fixes: 5d7422cfb498 ("accel/ivpu: Add IPC driver and JSM messages") Cc: # v6.3+ Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-2-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_ipc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_ipc.c b/drivers/accel/ivpu/ivpu_ipc.c index 04ac4b9840fb..56ff067f63e2 100644 --- a/drivers/accel/ivpu/ivpu_ipc.c +++ b/drivers/accel/ivpu/ivpu_ipc.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include @@ -501,7 +501,11 @@ int ivpu_ipc_init(struct ivpu_device *vdev) spin_lock_init(&ipc->cons_lock); INIT_LIST_HEAD(&ipc->cons_list); INIT_LIST_HEAD(&ipc->cb_msg_list); - drmm_mutex_init(&vdev->drm, &ipc->lock); + ret = drmm_mutex_init(&vdev->drm, &ipc->lock); + if (ret) { + ivpu_err(vdev, "Failed to initialize ipc->lock, ret %d\n", ret); + goto err_free_rx; + } ivpu_ipc_reset(vdev); return 0; From e3caadf1f9dfc9d62b5ffc3bd73ebac0c8f26b3f Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:23 +0200 Subject: [PATCH 065/270] accel/ivpu: Remove d3hot_after_power_off WA Always enter D3hot after entering D0i3 an all platforms. This minimizes power usage. Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-3-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 20 ++++++++++---------- drivers/accel/ivpu/ivpu_drv.h | 3 +-- drivers/accel/ivpu/ivpu_hw_37xx.c | 4 +--- drivers/accel/ivpu/ivpu_pm.c | 7 ++----- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 39f6d1b98fd6..303d92753387 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include @@ -387,12 +387,15 @@ int ivpu_shutdown(struct ivpu_device *vdev) { int ret; - ivpu_prepare_for_reset(vdev); + /* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */ + pci_save_state(to_pci_dev(vdev->drm.dev)); ret = ivpu_hw_power_down(vdev); if (ret) ivpu_warn(vdev, "Failed to power down HW: %d\n", ret); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); + return ret; } @@ -560,11 +563,11 @@ static int ivpu_dev_init(struct ivpu_device *vdev) /* Power up early so the rest of init code can access VPU registers */ ret = ivpu_hw_power_up(vdev); if (ret) - goto err_power_down; + goto err_shutdown; ret = ivpu_mmu_global_context_init(vdev); if (ret) - goto err_power_down; + goto err_shutdown; ret = ivpu_mmu_init(vdev); if (ret) @@ -601,10 +604,8 @@ err_mmu_rctx_fini: ivpu_mmu_reserved_context_fini(vdev); err_mmu_gctx_fini: ivpu_mmu_global_context_fini(vdev); -err_power_down: - ivpu_hw_power_down(vdev); - if (IVPU_WA(d3hot_after_power_off)) - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); +err_shutdown: + ivpu_shutdown(vdev); err_xa_destroy: xa_destroy(&vdev->db_xa); xa_destroy(&vdev->submitted_jobs_xa); @@ -628,9 +629,8 @@ static void ivpu_bo_unbind_all_user_contexts(struct ivpu_device *vdev) static void ivpu_dev_fini(struct ivpu_device *vdev) { ivpu_pm_disable(vdev); + ivpu_prepare_for_reset(vdev); ivpu_shutdown(vdev); - if (IVPU_WA(d3hot_after_power_off)) - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); ivpu_jobs_abort_all(vdev); ivpu_job_done_consumer_fini(vdev); diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h index 7be0500d9bb8..bb4374d0eaec 100644 --- a/drivers/accel/ivpu/ivpu_drv.h +++ b/drivers/accel/ivpu/ivpu_drv.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #ifndef __IVPU_DRV_H__ @@ -90,7 +90,6 @@ struct ivpu_wa_table { bool punit_disabled; bool clear_runtime_mem; - bool d3hot_after_power_off; bool interrupt_clear_with_0; bool disable_clock_relinquish; bool disable_d0i3_msg; diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c index 9a0c9498baba..5e2865f9f7d6 100644 --- a/drivers/accel/ivpu/ivpu_hw_37xx.c +++ b/drivers/accel/ivpu/ivpu_hw_37xx.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include "ivpu_drv.h" @@ -75,7 +75,6 @@ static void ivpu_hw_wa_init(struct ivpu_device *vdev) { vdev->wa.punit_disabled = false; vdev->wa.clear_runtime_mem = false; - vdev->wa.d3hot_after_power_off = true; REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, BUTTRESS_ALL_IRQ_MASK); if (REGB_RD32(VPU_37XX_BUTTRESS_INTERRUPT_STAT) == BUTTRESS_ALL_IRQ_MASK) { @@ -86,7 +85,6 @@ static void ivpu_hw_wa_init(struct ivpu_device *vdev) IVPU_PRINT_WA(punit_disabled); IVPU_PRINT_WA(clear_runtime_mem); - IVPU_PRINT_WA(d3hot_after_power_off); IVPU_PRINT_WA(interrupt_clear_with_0); } diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 7cce1c928a7f..9cbd7af6576b 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include @@ -58,15 +58,12 @@ static int ivpu_suspend(struct ivpu_device *vdev) { int ret; - /* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */ - pci_save_state(to_pci_dev(vdev->drm.dev)); + ivpu_prepare_for_reset(vdev); ret = ivpu_shutdown(vdev); if (ret) ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret); - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); - return ret; } From 3534eacbf101f6e66105f03d869a03893407c384 Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Tue, 2 Apr 2024 12:49:24 +0200 Subject: [PATCH 066/270] accel/ivpu: Fix PCI D0 state entry in resume In case of failed power up we end up left in PCI D3hot state making it impossible to access NPU registers on retry. Enter D0 state on retry before proceeding with power up sequence. Fixes: 28083ff18d3f ("accel/ivpu: Fix DevTLB errors on suspend/resume and recovery") Cc: # v6.8+ Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-4-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 9cbd7af6576b..325b82f8d971 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -71,10 +71,10 @@ static int ivpu_resume(struct ivpu_device *vdev) { int ret; - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0); - pci_restore_state(to_pci_dev(vdev->drm.dev)); - retry: + pci_restore_state(to_pci_dev(vdev->drm.dev)); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0); + ret = ivpu_hw_power_up(vdev); if (ret) { ivpu_err(vdev, "Failed to power up HW: %d\n", ret); From 875bc9cd1b33eb027a5663f5e6878a43d98e9a16 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:25 +0200 Subject: [PATCH 067/270] accel/ivpu: Put NPU back to D3hot after failed resume Put NPU in D3hot after ivpu_resume() fails to power up the device. This will assure that D3->D0 power cycle will be performed before the next resume and also will minimize power usage in this corner case. Fixes: 28083ff18d3f ("accel/ivpu: Fix DevTLB errors on suspend/resume and recovery") Cc: # v6.8+ Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-5-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 325b82f8d971..ba51781b5896 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -97,6 +97,7 @@ err_mmu_disable: ivpu_mmu_disable(vdev); err_power_down: ivpu_hw_power_down(vdev); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); if (!ivpu_fw_is_cold_boot(vdev)) { ivpu_pm_prepare_cold_boot(vdev); From 3556f922612caf4c9b97cf7337626f8342b3dea3 Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Tue, 2 Apr 2024 12:49:26 +0200 Subject: [PATCH 068/270] accel/ivpu: Improve clarity of MMU error messages This patch improves readability and clarity of MMU error messages. Previously, the error strings were somewhat confusing and could lead to ambiguous interpretations, making it difficult to diagnose issues. Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-6-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_mmu.c b/drivers/accel/ivpu/ivpu_mmu.c index 91bd640655ab..2e46b322c450 100644 --- a/drivers/accel/ivpu/ivpu_mmu.c +++ b/drivers/accel/ivpu/ivpu_mmu.c @@ -278,7 +278,7 @@ static const char *ivpu_mmu_event_to_str(u32 cmd) case IVPU_MMU_EVT_F_VMS_FETCH: return "Fetch of VMS caused external abort"; default: - return "Unknown CMDQ command"; + return "Unknown event"; } } @@ -286,15 +286,15 @@ static const char *ivpu_mmu_cmdq_err_to_str(u32 err) { switch (err) { case IVPU_MMU_CERROR_NONE: - return "No CMDQ Error"; + return "No error"; case IVPU_MMU_CERROR_ILL: return "Illegal command"; case IVPU_MMU_CERROR_ABT: - return "External abort on CMDQ read"; + return "External abort on command queue read"; case IVPU_MMU_CERROR_ATC_INV_SYNC: return "Sync failed to complete ATS invalidation"; default: - return "Unknown CMDQ Error"; + return "Unknown error"; } } From c52c35e5b404b95a5bcff39af9be1b9293be3434 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:27 +0200 Subject: [PATCH 069/270] accel/ivpu: Return max freq for DRM_IVPU_PARAM_CORE_CLOCK_RATE DRM_IVPU_PARAM_CORE_CLOCK_RATE returns current NPU frequency which could be 0 if device was sleeping. This value isn't really useful to the user space, so return max freq instead which can be used to estimate NPU performance. Fixes: c39dc15191c4 ("accel/ivpu: Read clock rate only if device is up") Cc: # v6.7 Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-7-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 18 +----------------- drivers/accel/ivpu/ivpu_hw.h | 6 ++++++ drivers/accel/ivpu/ivpu_hw_37xx.c | 7 ++++--- drivers/accel/ivpu/ivpu_hw_40xx.c | 6 ++++++ 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 303d92753387..77283daaedd1 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -131,22 +131,6 @@ static int ivpu_get_capabilities(struct ivpu_device *vdev, struct drm_ivpu_param return 0; } -static int ivpu_get_core_clock_rate(struct ivpu_device *vdev, u64 *clk_rate) -{ - int ret; - - ret = ivpu_rpm_get_if_active(vdev); - if (ret < 0) - return ret; - - *clk_rate = ret ? ivpu_hw_reg_pll_freq_get(vdev) : 0; - - if (ret) - ivpu_rpm_put(vdev); - - return 0; -} - static int ivpu_get_param_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct ivpu_file_priv *file_priv = file->driver_priv; @@ -170,7 +154,7 @@ static int ivpu_get_param_ioctl(struct drm_device *dev, void *data, struct drm_f args->value = vdev->platform; break; case DRM_IVPU_PARAM_CORE_CLOCK_RATE: - ret = ivpu_get_core_clock_rate(vdev, &args->value); + args->value = ivpu_hw_ratio_to_freq(vdev, vdev->hw->pll.max_ratio); break; case DRM_IVPU_PARAM_NUM_CONTEXTS: args->value = ivpu_get_context_count(vdev); diff --git a/drivers/accel/ivpu/ivpu_hw.h b/drivers/accel/ivpu/ivpu_hw.h index b2909168a0a6..094c659d2800 100644 --- a/drivers/accel/ivpu/ivpu_hw.h +++ b/drivers/accel/ivpu/ivpu_hw.h @@ -21,6 +21,7 @@ struct ivpu_hw_ops { u32 (*profiling_freq_get)(struct ivpu_device *vdev); void (*profiling_freq_drive)(struct ivpu_device *vdev, bool enable); u32 (*reg_pll_freq_get)(struct ivpu_device *vdev); + u32 (*ratio_to_freq)(struct ivpu_device *vdev, u32 ratio); u32 (*reg_telemetry_offset_get)(struct ivpu_device *vdev); u32 (*reg_telemetry_size_get)(struct ivpu_device *vdev); u32 (*reg_telemetry_enable_get)(struct ivpu_device *vdev); @@ -130,6 +131,11 @@ static inline u32 ivpu_hw_reg_pll_freq_get(struct ivpu_device *vdev) return vdev->hw->ops->reg_pll_freq_get(vdev); }; +static inline u32 ivpu_hw_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) +{ + return vdev->hw->ops->ratio_to_freq(vdev, ratio); +} + static inline u32 ivpu_hw_reg_telemetry_offset_get(struct ivpu_device *vdev) { return vdev->hw->ops->reg_telemetry_offset_get(vdev); diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c index 5e2865f9f7d6..bd25e2d9fb0f 100644 --- a/drivers/accel/ivpu/ivpu_hw_37xx.c +++ b/drivers/accel/ivpu/ivpu_hw_37xx.c @@ -803,12 +803,12 @@ static void ivpu_hw_37xx_profiling_freq_drive(struct ivpu_device *vdev, bool ena /* Profiling freq - is a debug feature. Unavailable on VPU 37XX. */ } -static u32 ivpu_hw_37xx_pll_to_freq(u32 ratio, u32 config) +static u32 ivpu_hw_37xx_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) { u32 pll_clock = PLL_REF_CLK_FREQ * ratio; u32 cpu_clock; - if ((config & 0xff) == PLL_RATIO_4_3) + if ((vdev->hw->config & 0xff) == PLL_RATIO_4_3) cpu_clock = pll_clock * 2 / 4; else cpu_clock = pll_clock * 2 / 5; @@ -827,7 +827,7 @@ static u32 ivpu_hw_37xx_reg_pll_freq_get(struct ivpu_device *vdev) if (!ivpu_is_silicon(vdev)) return PLL_SIMULATION_FREQ; - return ivpu_hw_37xx_pll_to_freq(pll_curr_ratio, vdev->hw->config); + return ivpu_hw_37xx_ratio_to_freq(vdev, pll_curr_ratio); } static u32 ivpu_hw_37xx_reg_telemetry_offset_get(struct ivpu_device *vdev) @@ -1050,6 +1050,7 @@ const struct ivpu_hw_ops ivpu_hw_37xx_ops = { .profiling_freq_get = ivpu_hw_37xx_profiling_freq_get, .profiling_freq_drive = ivpu_hw_37xx_profiling_freq_drive, .reg_pll_freq_get = ivpu_hw_37xx_reg_pll_freq_get, + .ratio_to_freq = ivpu_hw_37xx_ratio_to_freq, .reg_telemetry_offset_get = ivpu_hw_37xx_reg_telemetry_offset_get, .reg_telemetry_size_get = ivpu_hw_37xx_reg_telemetry_size_get, .reg_telemetry_enable_get = ivpu_hw_37xx_reg_telemetry_enable_get, diff --git a/drivers/accel/ivpu/ivpu_hw_40xx.c b/drivers/accel/ivpu/ivpu_hw_40xx.c index e4eddbf5d11c..b0b88d4c8926 100644 --- a/drivers/accel/ivpu/ivpu_hw_40xx.c +++ b/drivers/accel/ivpu/ivpu_hw_40xx.c @@ -980,6 +980,11 @@ static u32 ivpu_hw_40xx_reg_pll_freq_get(struct ivpu_device *vdev) return PLL_RATIO_TO_FREQ(pll_curr_ratio); } +static u32 ivpu_hw_40xx_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) +{ + return PLL_RATIO_TO_FREQ(ratio); +} + static u32 ivpu_hw_40xx_reg_telemetry_offset_get(struct ivpu_device *vdev) { return REGB_RD32(VPU_40XX_BUTTRESS_VPU_TELEMETRY_OFFSET); @@ -1230,6 +1235,7 @@ const struct ivpu_hw_ops ivpu_hw_40xx_ops = { .profiling_freq_get = ivpu_hw_40xx_profiling_freq_get, .profiling_freq_drive = ivpu_hw_40xx_profiling_freq_drive, .reg_pll_freq_get = ivpu_hw_40xx_reg_pll_freq_get, + .ratio_to_freq = ivpu_hw_40xx_ratio_to_freq, .reg_telemetry_offset_get = ivpu_hw_40xx_reg_telemetry_offset_get, .reg_telemetry_size_get = ivpu_hw_40xx_reg_telemetry_size_get, .reg_telemetry_enable_get = ivpu_hw_40xx_reg_telemetry_enable_get, From 0d298e23292b7a5b58c5589fe33b96e95363214f Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:28 +0200 Subject: [PATCH 070/270] accel/ivpu: Fix missed error message after VPU rename Change "VPU" to "NPU" in ivpu_suspend() so it matches all other error messages. Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-8-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index ba51781b5896..4f5ea466731f 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -62,7 +62,7 @@ static int ivpu_suspend(struct ivpu_device *vdev) ret = ivpu_shutdown(vdev); if (ret) - ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret); + ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret); return ret; } From fd7726e75968b27fe98534ccbf47ccd6fef686f3 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:29 +0200 Subject: [PATCH 071/270] accel/ivpu: Fix deadlock in context_xa ivpu_device->context_xa is locked both in kernel thread and IRQ context. It requires XA_FLAGS_LOCK_IRQ flag to be passed during initialization otherwise the lock could be acquired from a thread and interrupted by an IRQ that locks it for the second time causing the deadlock. This deadlock was reported by lockdep and observed in internal tests. Fixes: 35b137630f08 ("accel/ivpu: Introduce a new DRM driver for Intel VPU") Cc: # v6.3+ Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-9-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 77283daaedd1..51d3f1a55d02 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -517,7 +517,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev) vdev->context_xa_limit.min = IVPU_USER_CONTEXT_MIN_SSID; vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID; atomic64_set(&vdev->unique_id_counter, 0); - xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC); + xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1); xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1); lockdep_set_class(&vdev->submitted_jobs_xa.xa_lock, &submitted_jobs_xa_lock_class_key); From 3c89a068bfd0698a5478f4cf39493595ef757d5e Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 8 Apr 2024 09:02:23 +0200 Subject: [PATCH 072/270] PM: s2idle: Make sure CPUs will wakeup directly on resume s2idle works like a regular suspend with freezing processes and freezing devices. All CPUs except the control CPU go into idle. Once this is completed the control CPU kicks all other CPUs out of idle, so that they reenter the idle loop and then enter s2idle state. The control CPU then issues an swait() on the suspend state and therefore enters the idle loop as well. Due to being kicked out of idle, the other CPUs leave their NOHZ states, which means the tick is active and the corresponding hrtimer is programmed to the next jiffie. On entering s2idle the CPUs shut down their local clockevent device to prevent wakeups. The last CPU which enters s2idle shuts down its local clockevent and freezes timekeeping. On resume, one of the CPUs receives the wakeup interrupt, unfreezes timekeeping and its local clockevent and starts the resume process. At that point all other CPUs are still in s2idle with their clockevents switched off. They only resume when they are kicked by another CPU or after resuming devices and then receiving a device interrupt. That means there is no guarantee that all CPUs will wakeup directly on resume. As a consequence there is no guarantee that timers which are queued on those CPUs and should expire directly after resume, are handled. Also timer list timers which are remotely queued to one of those CPUs after resume will not result in a reprogramming IPI as the tick is active. Queueing a hrtimer will also not result in a reprogramming IPI because the first hrtimer event is already in the past. The recent introduction of the timer pull model (7ee988770326 ("timers: Implement the hierarchical pull model")) amplifies this problem, if the current migrator is one of the non woken up CPUs. When a non pinned timer list timer is queued and the queuing CPU goes idle, it relies on the still suspended migrator CPU to expire the timer which will happen by chance. The problem exists since commit 8d89835b0467 ("PM: suspend: Do not pause cpuidle in the suspend-to-idle path"). There the cpuidle_pause() call which in turn invoked a wakeup for all idle CPUs was moved to a later point in the resume process. This might not be reached or reached very late because it waits on a timer of a still suspended CPU. Address this by kicking all CPUs out of idle after the control CPU returns from swait() so that they resume their timers and restore consistent system state. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218641 Fixes: 8d89835b0467 ("PM: suspend: Do not pause cpuidle in the suspend-to-idle path") Signed-off-by: Anna-Maria Behnsen Reviewed-by: Thomas Gleixner Tested-by: Mario Limonciello Cc: 5.16+ # 5.16+ Acked-by: Peter Zijlstra (Intel) Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index e3ae93bbcb9b..09f8397bae15 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -106,6 +106,12 @@ static void s2idle_enter(void) swait_event_exclusive(s2idle_wait_head, s2idle_state == S2IDLE_STATE_WAKE); + /* + * Kick all CPUs to ensure that they resume their timers and restore + * consistent system state. + */ + wake_up_all_idle_cpus(); + cpus_read_unlock(); raw_spin_lock_irq(&s2idle_lock); From 5ce344beaca688f4cdea07045e0b8f03dc537e74 Mon Sep 17 00:00:00 2001 From: Adam Dunlap Date: Mon, 18 Mar 2024 16:09:27 -0700 Subject: [PATCH 073/270] x86/apic: Force native_apic_mem_read() to use the MOV instruction When done from a virtual machine, instructions that touch APIC memory must be emulated. By convention, MMIO accesses are typically performed via io.h helpers such as readl() or writeq() to simplify instruction emulation/decoding (ex: in KVM hosts and SEV guests) [0]. Currently, native_apic_mem_read() does not follow this convention, allowing the compiler to emit instructions other than the MOV instruction generated by readl(). In particular, when the kernel is compiled with clang and run as a SEV-ES or SEV-SNP guest, the compiler would emit a TESTL instruction which is not supported by the SEV-ES emulator, causing a boot failure in that environment. It is likely the same problem would happen in a TDX guest as that uses the same instruction emulator as SEV-ES. To make sure all emulators can emulate APIC memory reads via MOV, use the readl() function in native_apic_mem_read(). It is expected that any emulator would support MOV in any addressing mode as it is the most generic and is what is usually emitted currently. The TESTL instruction is emitted when native_apic_mem_read() is inlined into apic_mem_wait_icr_idle(). The emulator comes from insn_decode_mmio() in arch/x86/lib/insn-eval.c. It's not worth it to extend insn_decode_mmio() to support more instructions since, in theory, the compiler could choose to output nearly any instruction for such reads which would bloat the emulator beyond reason. [0] https://lore.kernel.org/all/20220405232939.73860-12-kirill.shutemov@linux.intel.com/ [ bp: Massage commit message, fix typos. ] Signed-off-by: Adam Dunlap Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Thomas Gleixner Reviewed-by: Ard Biesheuvel Tested-by: Kevin Loughlin Cc: Link: https://lore.kernel.org/r/20240318230927.2191933-1-acdunlap@google.com --- arch/x86/include/asm/apic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 94ce0f7c9d3a..e6ab0cf15ed5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -13,6 +13,7 @@ #include #include #include +#include #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -98,7 +99,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) static inline u32 native_apic_mem_read(u32 reg) { - return *((volatile u32 *)(APIC_BASE + reg)); + return readl((void __iomem *)(APIC_BASE + reg)); } static inline void native_apic_mem_eoi(void) From c1d11fc2c8320871b40730991071dd0a0b405bc8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2024 09:46:01 +0200 Subject: [PATCH 074/270] irqflags: Explicitly ignore lockdep_hrtimer_exit() argument When building with 'make W=1' but CONFIG_TRACE_IRQFLAGS=n, the unused argument to lockdep_hrtimer_exit() causes a warning: kernel/time/hrtimer.c:1655:14: error: variable 'expires_in_hardirq' set but not used [-Werror=unused-but-set-variable] This is intentional behavior, so add a cast to void to shut up the warning. Fixes: 73d20564e0dc ("hrtimer: Don't dereference the hrtimer pointer after the callback") Reported-by: kernel test robot Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240408074609.3170807-1-arnd@kernel.org Closes: https://lore.kernel.org/oe-kbuild-all/202311191229.55QXHVc6-lkp@intel.com/ --- include/linux/irqflags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 147feebd508c..3f003d5fde53 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -114,7 +114,7 @@ do { \ # define lockdep_softirq_enter() do { } while (0) # define lockdep_softirq_exit() do { } while (0) # define lockdep_hrtimer_enter(__hrtimer) false -# define lockdep_hrtimer_exit(__context) do { } while (0) +# define lockdep_hrtimer_exit(__context) do { (void)(__context); } while (0) # define lockdep_posixtimer_enter() do { } while (0) # define lockdep_posixtimer_exit() do { } while (0) # define lockdep_irq_work_enter(__work) do { } while (0) From fa1f51162338b3e2f520d4bfedc42b3b2e00da6d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 19 Mar 2024 19:20:50 +0100 Subject: [PATCH 075/270] locking: Make rwsem_assert_held_write_nolockdep() build with PREEMPT_RT=y The commit cited below broke the build for PREEMPT_RT because rwsem_assert_held_write_nolockdep() passes a struct rw_semaphore but rw_base_assert_held_write() expects struct rwbase_rt. Fixing the type alone leads to the problem that WARN_ON() is not found because bug.h is missing. In order to resolve this: - Keep the assert (WARN_ON()) in rwsem.h (not rwbase_rt.h) - Make rwsem_assert_held_write_nolockdep() do the implementation specific (rw_base) writer check. - Replace the "inline" with __always_inline which was used before. Fixes: f70405afc99b1 ("locking: Add rwsem_assert_held() and rwsem_assert_held_write()") Reported-by: Clark Williams Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Waiman Long Link: https://lore.kernel.org/r/20240319182050.U4AzUF3I@linutronix.de --- include/linux/rwbase_rt.h | 4 ++-- include/linux/rwsem.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/rwbase_rt.h b/include/linux/rwbase_rt.h index 29c4e4f243e4..f2394a409c9d 100644 --- a/include/linux/rwbase_rt.h +++ b/include/linux/rwbase_rt.h @@ -31,9 +31,9 @@ static __always_inline bool rw_base_is_locked(const struct rwbase_rt *rwb) return atomic_read(&rwb->readers) != READER_BIAS; } -static inline void rw_base_assert_held_write(const struct rwbase_rt *rwb) +static __always_inline bool rw_base_is_write_locked(const struct rwbase_rt *rwb) { - WARN_ON(atomic_read(&rwb->readers) != WRITER_BIAS); + return atomic_read(&rwb->readers) == WRITER_BIAS; } static __always_inline bool rw_base_is_contended(const struct rwbase_rt *rwb) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 4f1c18992f76..c8b543d428b0 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -167,14 +167,14 @@ static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) return rw_base_is_locked(&sem->rwbase); } -static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) +static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!rwsem_is_locked(sem)); } -static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) +static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) { - rw_base_assert_held_write(sem); + WARN_ON(!rw_base_is_write_locked(&sem->rwbase)); } static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) From d730192ff0246356a2d7e63ff5bd501060670eec Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 6 Apr 2024 13:40:52 +0200 Subject: [PATCH 076/270] ACPI: scan: Do not increase dep_unmet for already met dependencies On the Toshiba Encore WT10-A tablet the BATC battery ACPI device depends on 3 other devices: Name (_DEP, Package (0x03) // _DEP: Dependencies { I2C1, GPO2, GPO0 }) acpi_scan_check_dep() adds all 3 of these to the acpi_dep_list and then before an acpi_device is created for the BATC handle (and thus before acpi_scan_dep_init() runs) acpi_scan_clear_dep() gets called for both GPIO depenencies, with free_when_met not set for the dependencies. Since there is no adev for BATC yet, there also is no dep_unmet to decrement. The only result of acpi_scan_clear_dep() in this case is dep->met getting set. Soon after acpi_scan_clear_dep() has been called for the GPIO dependencies the acpi_device gets created for the BATC handle and acpi_scan_dep_init() runs, this sees 3 dependencies on the acpi_dep_list and initializes unmet_dep to 3. Later when the dependency for I2C1 is met unmet_dep becomes 2, but since the 2 GPIO deps where already met it never becomes 0 causing battery monitoring to not work. Fix this by modifying acpi_scan_dep_init() to not increase dep_met for dependencies which have already been marked as being met. Fixes: 3ba12d8de3fa ("ACPI: scan: Reduce overhead related to devices with dependencies") Signed-off-by: Hans de Goede Cc: 6.5+ # 6.5+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/scan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 7c157bf92695..d1464324de95 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1843,7 +1843,8 @@ static void acpi_scan_dep_init(struct acpi_device *adev) if (dep->honor_dep) adev->flags.honor_deps = 1; - adev->dep_unmet++; + if (!dep->met) + adev->dep_unmet++; } } } From 8ab58f6841b19423231c5db3378691ec80c778f8 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 14 Mar 2024 16:49:43 +0100 Subject: [PATCH 077/270] gpu: host1x: Do not setup DMA for virtual devices The host1x devices are virtual compound devices and do not perform DMA accesses themselves, so they do not need to be set up for DMA. Ideally we would also not need to set up DMA masks for the virtual devices, but we currently still need those for legacy support on old hardware. Tested-by: Jon Hunter Acked-by: Jon Hunter Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20240314154943.2487549-1-thierry.reding@gmail.com --- drivers/gpu/host1x/bus.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c index 783975d1384f..7c52757a89db 100644 --- a/drivers/gpu/host1x/bus.c +++ b/drivers/gpu/host1x/bus.c @@ -351,11 +351,6 @@ static int host1x_device_uevent(const struct device *dev, return 0; } -static int host1x_dma_configure(struct device *dev) -{ - return of_dma_configure(dev, dev->of_node, true); -} - static const struct dev_pm_ops host1x_device_pm_ops = { .suspend = pm_generic_suspend, .resume = pm_generic_resume, @@ -369,7 +364,6 @@ const struct bus_type host1x_bus_type = { .name = "host1x", .match = host1x_device_match, .uevent = host1x_device_uevent, - .dma_configure = host1x_dma_configure, .pm = &host1x_device_pm_ops, }; @@ -458,8 +452,6 @@ static int host1x_device_add(struct host1x *host1x, device->dev.bus = &host1x_bus_type; device->dev.parent = host1x->dev; - of_dma_configure(&device->dev, host1x->dev->of_node, true); - device->dev.dma_parms = &device->dma_parms; dma_set_max_seg_size(&device->dev, UINT_MAX); From aca1a5287ea328fd1f7e2bfa6806646486d86a70 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 28 Mar 2024 09:25:40 +0530 Subject: [PATCH 078/270] ACPI: bus: allow _UID matching for integer zero Commit b2b32a173881 ("ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types") added _UID matching support for both integer and string types, which satisfies NULL @uid2 argument for string types using inversion, but this logic prevents _UID comparision in case the argument is integer 0, which may result in false positives. Fix this using _Generic(), which will allow NULL @uid2 argument for string types as well as _UID matching for all possible integer values. Fixes: b2b32a173881 ("ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types") Signed-off-by: Raag Jadav [ rjw: Comment adjustment ] Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 5de954e2b18a..e7796f373d0d 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -911,17 +911,19 @@ static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2) * acpi_dev_hid_uid_match - Match device by supplied HID and UID * @adev: ACPI device to match. * @hid2: Hardware ID of the device. - * @uid2: Unique ID of the device, pass 0 or NULL to not check _UID. + * @uid2: Unique ID of the device, pass NULL to not check _UID. * * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2 * will be treated as a match. If user wants to validate @uid2, it should be * done before calling this function. * - * Returns: %true if matches or @uid2 is 0 or NULL, %false otherwise. + * Returns: %true if matches or @uid2 is NULL, %false otherwise. */ #define acpi_dev_hid_uid_match(adev, hid2, uid2) \ (acpi_dev_hid_match(adev, hid2) && \ - (!(uid2) || acpi_dev_uid_match(adev, uid2))) + /* Distinguish integer 0 from NULL @uid2 */ \ + (_Generic(uid2, ACPI_STR_TYPES(!(uid2)), default: 0) || \ + acpi_dev_uid_match(adev, uid2))) void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); From 3eadd887dbac1df8f25f701e5d404d1b90fd0fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 4 Apr 2024 23:33:25 +0300 Subject: [PATCH 079/270] drm/client: Fully protect modes[] with dev->mode_config.mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The modes[] array contains pointers to modes on the connectors' mode lists, which are protected by dev->mode_config.mutex. Thus we need to extend modes[] the same protection or by the time we use it the elements may already be pointing to freed/reused memory. Cc: stable@vger.kernel.org Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10583 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240404203336.10454-2-ville.syrjala@linux.intel.com Reviewed-by: Dmitry Baryshkov Reviewed-by: Jani Nikula Reviewed-by: Thomas Zimmermann --- drivers/gpu/drm/drm_client_modeset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c index 871e4e2129d6..0683a129b362 100644 --- a/drivers/gpu/drm/drm_client_modeset.c +++ b/drivers/gpu/drm/drm_client_modeset.c @@ -777,6 +777,7 @@ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int total_modes_count = 0; struct drm_client_offset *offsets; unsigned int connector_count = 0; + /* points to modes protected by mode_config.mutex */ struct drm_display_mode **modes; struct drm_crtc **crtcs; int i, ret = 0; @@ -845,7 +846,6 @@ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, 0, width, height); } - mutex_unlock(&dev->mode_config.mutex); drm_client_modeset_release(client); @@ -875,6 +875,7 @@ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, modeset->y = offset->y; } } + mutex_unlock(&dev->mode_config.mutex); mutex_unlock(&client->modeset_mutex); out: From 592780b8391fe31f129ef4823c1513528f4dcb76 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:13 -0700 Subject: [PATCH 080/270] cxl: Fix retrieving of access_coordinates in PCIe path Current loop in cxl_endpoint_get_perf_coordinates() incorrectly assumes the Root Port (RP) dport is the one with generic port access_coordinate. However those coordinates are one level up in the Host Bridge (HB). Current code causes the computation code to pick up 0s as the coordinates and cause minimal bandwidth to result in 0. Add check to skip RP when combining coordinates. Fixes: 14a6960b3e92 ("cxl: Add helper function that calculate performance data for downstream ports") Reported-by: Jonathan Cameron Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 6cbde50a742b..7aadcec4fc64 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2165,6 +2165,11 @@ int cxl_hb_get_perf_coordinates(struct cxl_port *port, return 0; } +static bool parent_port_is_cxl_root(struct cxl_port *port) +{ + return is_cxl_root(to_cxl_port(port->dev.parent)); +} + /** * cxl_endpoint_get_perf_coordinates - Retrieve performance numbers stored in dports * of CXL path @@ -2184,27 +2189,31 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct cxl_dport *dport; struct pci_dev *pdev; unsigned int bw; + bool is_cxl_root; if (!is_cxl_endpoint(port)) return -EINVAL; - dport = iter->parent_dport; - /* - * Exit the loop when the parent port of the current port is cxl root. - * The iterative loop starts at the endpoint and gathers the - * latency of the CXL link from the current iter to the next downstream - * port each iteration. If the parent is cxl root then there is - * nothing to gather. + * Exit the loop when the parent port of the current iter port is cxl + * root. The iterative loop starts at the endpoint and gathers the + * latency of the CXL link from the current device/port to the connected + * downstream port each iteration. */ - while (!is_cxl_root(to_cxl_port(iter->dev.parent))) { - cxl_coordinates_combine(&c, &c, &dport->sw_coord); + do { + dport = iter->parent_dport; + iter = to_cxl_port(iter->dev.parent); + is_cxl_root = parent_port_is_cxl_root(iter); + + /* + * There's no valid access_coordinate for a root port since RPs do not + * have CDAT and therefore needs to be skipped. + */ + if (!is_cxl_root) + cxl_coordinates_combine(&c, &c, &dport->sw_coord); c.write_latency += dport->link_latency; c.read_latency += dport->link_latency; - - iter = to_cxl_port(iter->dev.parent); - dport = iter->parent_dport; - } + } while (!is_cxl_root); /* Get the calculated PCI paths bandwidth */ pdev = to_pci_dev(port->uport_dev->parent); From 51293c565cf4b8d57c154efadb57b17866c74bcb Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:14 -0700 Subject: [PATCH 081/270] cxl: Fix incorrect region perf data calculation Current math in cxl_region_perf_data_calculate divides the latency by 1000 every time the function gets called. This causes the region latency to be divided by 1000 per memory device and the math is incorrect. This is user visible as the latency access_coordinate exposed via sysfs will show incorrect latency data. Normalize values from CDAT to nanoseconds. Adjust sub-nanoseconds latency to at least 1. Remove adjustment of perf numbers from the generic target since hmat handling code has already normalized those numbers. Now all computation and stored numbers should be in nanoseconds. cxl_hb_get_perf_coordinates() is removed and HB coords are calculated in the port access_coordinate calculation path since it no longer need to be treated special. Fixes: 3d9f4a197230 ("cxl/region: Calculate performance data for a region") Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-4-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 13 +----- drivers/cxl/core/cdat.c | 94 ++++++++++++++++++----------------------- drivers/cxl/core/port.c | 36 ++-------------- drivers/cxl/cxl.h | 2 - 4 files changed, 45 insertions(+), 100 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index af5cb818f84d..566c387d4385 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -525,22 +525,11 @@ static int get_genport_coordinates(struct device *dev, struct cxl_dport *dport) { struct acpi_device *hb = to_cxl_host_bridge(NULL, dev); u32 uid; - int rc; if (kstrtou32(acpi_device_uid(hb), 0, &uid)) return -EINVAL; - rc = acpi_get_genport_coordinates(uid, dport->hb_coord); - if (rc < 0) - return rc; - - /* Adjust back to picoseconds from nanoseconds */ - for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { - dport->hb_coord[i].read_latency *= 1000; - dport->hb_coord[i].write_latency *= 1000; - } - - return 0; + return acpi_get_genport_coordinates(uid, dport->hb_coord); } static int add_host_bridge_dport(struct device *match, void *arg) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index eddbbe21450c..f66b493b5d3c 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -20,6 +20,36 @@ struct dsmas_entry { int qos_class; }; +static u32 cdat_normalize(u16 entry, u64 base, u8 type) +{ + u32 value; + + /* + * Check for invalid and overflow values + */ + if (entry == 0xffff || !entry) + return 0; + else if (base > (UINT_MAX / (entry))) + return 0; + + /* + * CDAT fields follow the format of HMAT fields. See table 5 Device + * Scoped Latency and Bandwidth Information Structure in Coherent Device + * Attribute Table (CDAT) Specification v1.01. + */ + value = entry * base; + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + case ACPI_HMAT_READ_LATENCY: + case ACPI_HMAT_WRITE_LATENCY: + value = DIV_ROUND_UP(value, 1000); + break; + default: + break; + } + return value; +} + static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, const unsigned long end) { @@ -97,7 +127,6 @@ static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, __le16 le_val; u64 val; u16 len; - int rc; len = le16_to_cpu((__force __le16)hdr->length); if (len != size || (unsigned long)hdr + len > end) { @@ -124,10 +153,8 @@ static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, le_base = (__force __le64)dslbis->entry_base_unit; le_val = (__force __le16)dslbis->entry[0]; - rc = check_mul_overflow(le64_to_cpu(le_base), - le16_to_cpu(le_val), &val); - if (rc) - pr_warn("DSLBIS value overflowed.\n"); + val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), + dslbis->data_type); cxl_access_coordinate_set(&dent->coord, dslbis->data_type, val); @@ -164,7 +191,6 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, struct xarray *dsmas_xa) { struct access_coordinate ep_c; - struct access_coordinate coord[ACCESS_COORDINATE_MAX]; struct dsmas_entry *dent; int valid_entries = 0; unsigned long index; @@ -176,12 +202,6 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, return rc; } - rc = cxl_hb_get_perf_coordinates(port, coord); - if (rc) { - dev_dbg(&port->dev, "Failed to retrieve hb perf coordinates.\n"); - return rc; - } - struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); if (!cxl_root) @@ -194,18 +214,9 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, int qos_class; cxl_coordinates_combine(&dent->coord, &dent->coord, &ep_c); - /* - * Keeping the host bridge coordinates separate from the dsmas - * coordinates in order to allow calculation of access class - * 0 and 1 for region later. - */ - cxl_coordinates_combine(&coord[ACCESS_COORDINATE_CPU], - &coord[ACCESS_COORDINATE_CPU], - &dent->coord); dent->entries = 1; - rc = cxl_root->ops->qos_class(cxl_root, - &coord[ACCESS_COORDINATE_CPU], - 1, &qos_class); + rc = cxl_root->ops->qos_class(cxl_root, &dent->coord, 1, + &qos_class); if (rc != 1) continue; @@ -461,10 +472,8 @@ static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, le_base = (__force __le64)tbl->sslbis_header.entry_base_unit; le_val = (__force __le16)tbl->entries[i].latency_or_bandwidth; - - if (check_mul_overflow(le64_to_cpu(le_base), - le16_to_cpu(le_val), &val)) - dev_warn(dev, "SSLBIS value overflowed!\n"); + val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), + sslbis->data_type); xa_for_each(&port->dports, index, dport) { if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT || @@ -521,17 +530,13 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_port *port = cxlmd->endpoint; struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - struct access_coordinate hb_coord[ACCESS_COORDINATE_MAX]; - struct access_coordinate coord; struct range dpa = { .start = cxled->dpa_res->start, .end = cxled->dpa_res->end, }; struct cxl_dpa_perf *perf; - int rc; switch (cxlr->mode) { case CXL_DECODER_RAM: @@ -549,35 +554,16 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, if (!range_contains(&perf->dpa_range, &dpa)) return; - rc = cxl_hb_get_perf_coordinates(port, hb_coord); - if (rc) { - dev_dbg(&port->dev, "Failed to retrieve hb perf coordinates.\n"); - return; - } - for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { - /* Pickup the host bridge coords */ - cxl_coordinates_combine(&coord, &hb_coord[i], &perf->coord); - /* Get total bandwidth and the worst latency for the cxl region */ cxlr->coord[i].read_latency = max_t(unsigned int, cxlr->coord[i].read_latency, - coord.read_latency); + perf->coord.read_latency); cxlr->coord[i].write_latency = max_t(unsigned int, cxlr->coord[i].write_latency, - coord.write_latency); - cxlr->coord[i].read_bandwidth += coord.read_bandwidth; - cxlr->coord[i].write_bandwidth += coord.write_bandwidth; - - /* - * Convert latency to nanosec from picosec to be consistent - * with the resulting latency coordinates computed by the - * HMAT_REPORTING code. - */ - cxlr->coord[i].read_latency = - DIV_ROUND_UP(cxlr->coord[i].read_latency, 1000); - cxlr->coord[i].write_latency = - DIV_ROUND_UP(cxlr->coord[i].write_latency, 1000); + perf->coord.write_latency); + cxlr->coord[i].read_bandwidth += perf->coord.read_bandwidth; + cxlr->coord[i].write_bandwidth += perf->coord.write_bandwidth; } } diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 7aadcec4fc64..c7c00eb373af 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2133,38 +2133,6 @@ bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) } EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL); -/** - * cxl_hb_get_perf_coordinates - Retrieve performance numbers between initiator - * and host bridge - * - * @port: endpoint cxl_port - * @coord: output access coordinates - * - * Return: errno on failure, 0 on success. - */ -int cxl_hb_get_perf_coordinates(struct cxl_port *port, - struct access_coordinate *coord) -{ - struct cxl_port *iter = port; - struct cxl_dport *dport; - - if (!is_cxl_endpoint(port)) - return -EINVAL; - - dport = iter->parent_dport; - while (iter && !is_cxl_root(to_cxl_port(iter->dev.parent))) { - iter = to_cxl_port(iter->dev.parent); - dport = iter->parent_dport; - } - - coord[ACCESS_COORDINATE_LOCAL] = - dport->hb_coord[ACCESS_COORDINATE_LOCAL]; - coord[ACCESS_COORDINATE_CPU] = - dport->hb_coord[ACCESS_COORDINATE_CPU]; - - return 0; -} - static bool parent_port_is_cxl_root(struct cxl_port *port) { return is_cxl_root(to_cxl_port(port->dev.parent)); @@ -2215,6 +2183,10 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, c.read_latency += dport->link_latency; } while (!is_cxl_root); + dport = iter->parent_dport; + /* Retrieve HB coords */ + cxl_coordinates_combine(&c, &c, dport->hb_coord); + /* Get the calculated PCI paths bandwidth */ pdev = to_pci_dev(port->uport_dev->parent); bw = pcie_bandwidth_available(pdev, NULL, NULL, NULL); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 534e25e2f0a4..ed02373ce3d9 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -884,8 +884,6 @@ void cxl_switch_parse_cdat(struct cxl_port *port); int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct access_coordinate *coord); -int cxl_hb_get_perf_coordinates(struct cxl_port *port, - struct access_coordinate *coord); void cxl_region_perf_data_calculate(struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled); From 001c5d19341a39cb683ab0a18ce4b662a09d96a0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:15 -0700 Subject: [PATCH 082/270] cxl: Consolidate dport access_coordinate ->hb_coord and ->sw_coord into ->coord The driver stores access_coordinate for host bridge in ->hb_coord and switch CDAT access_coordinate in ->sw_coord. Since neither of these access_coordinate clobber each other, the variable name can be consolidated into ->coord to simplify the code. Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-5-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 2 +- drivers/cxl/core/cdat.c | 74 +++++++++++++++++++++++------------- drivers/cxl/core/port.c | 48 +++++++++++++++++------ drivers/cxl/cxl.h | 6 +-- drivers/cxl/cxlmem.h | 2 +- tools/testing/cxl/test/cxl.c | 10 +++-- 6 files changed, 94 insertions(+), 48 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 566c387d4385..cb8c155a2c9b 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -529,7 +529,7 @@ static int get_genport_coordinates(struct device *dev, struct cxl_dport *dport) if (kstrtou32(acpi_device_uid(hb), 0, &uid)) return -EINVAL; - return acpi_get_genport_coordinates(uid, dport->hb_coord); + return acpi_get_genport_coordinates(uid, dport->coord); } static int add_host_bridge_dport(struct device *match, void *arg) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index f66b493b5d3c..bb83867d9fec 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -14,7 +14,7 @@ struct dsmas_entry { struct range dpa_range; u8 handle; - struct access_coordinate coord; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; int entries; int qos_class; @@ -88,8 +88,8 @@ static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, return 0; } -static void cxl_access_coordinate_set(struct access_coordinate *coord, - int access, unsigned int val) +static void __cxl_access_coordinate_set(struct access_coordinate *coord, + int access, unsigned int val) { switch (access) { case ACPI_HMAT_ACCESS_LATENCY: @@ -115,6 +115,13 @@ static void cxl_access_coordinate_set(struct access_coordinate *coord, } } +static void cxl_access_coordinate_set(struct access_coordinate *coord, + int access, unsigned int val) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + __cxl_access_coordinate_set(&coord[i], access, val); +} + static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, const unsigned long end) { @@ -156,7 +163,7 @@ static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), dslbis->data_type); - cxl_access_coordinate_set(&dent->coord, dslbis->data_type, val); + cxl_access_coordinate_set(dent->coord, dslbis->data_type, val); return 0; } @@ -190,13 +197,13 @@ static int cxl_cdat_endpoint_process(struct cxl_port *port, static int cxl_port_perf_data_calculate(struct cxl_port *port, struct xarray *dsmas_xa) { - struct access_coordinate ep_c; + struct access_coordinate ep_c[ACCESS_COORDINATE_MAX]; struct dsmas_entry *dent; int valid_entries = 0; unsigned long index; int rc; - rc = cxl_endpoint_get_perf_coordinates(port, &ep_c); + rc = cxl_endpoint_get_perf_coordinates(port, ep_c); if (rc) { dev_dbg(&port->dev, "Failed to retrieve ep perf coordinates.\n"); return rc; @@ -213,10 +220,11 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, xa_for_each(dsmas_xa, index, dent) { int qos_class; - cxl_coordinates_combine(&dent->coord, &dent->coord, &ep_c); + cxl_coordinates_combine(dent->coord, dent->coord, ep_c); dent->entries = 1; - rc = cxl_root->ops->qos_class(cxl_root, &dent->coord, 1, - &qos_class); + rc = cxl_root->ops->qos_class(cxl_root, + &dent->coord[ACCESS_COORDINATE_CPU], + 1, &qos_class); if (rc != 1) continue; @@ -233,14 +241,17 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, static void update_perf_entry(struct device *dev, struct dsmas_entry *dent, struct cxl_dpa_perf *dpa_perf) { + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + dpa_perf->coord[i] = dent->coord[i]; dpa_perf->dpa_range = dent->dpa_range; - dpa_perf->coord = dent->coord; dpa_perf->qos_class = dent->qos_class; dev_dbg(dev, "DSMAS: dpa: %#llx qos: %d read_bw: %d write_bw %d read_lat: %d write_lat: %d\n", dent->dpa_range.start, dpa_perf->qos_class, - dent->coord.read_bandwidth, dent->coord.write_bandwidth, - dent->coord.read_latency, dent->coord.write_latency); + dent->coord[ACCESS_COORDINATE_CPU].read_bandwidth, + dent->coord[ACCESS_COORDINATE_CPU].write_bandwidth, + dent->coord[ACCESS_COORDINATE_CPU].read_latency, + dent->coord[ACCESS_COORDINATE_CPU].write_latency); } static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, @@ -477,10 +488,11 @@ static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, xa_for_each(&port->dports, index, dport) { if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT || - dsp_id == dport->port_id) - cxl_access_coordinate_set(&dport->sw_coord, + dsp_id == dport->port_id) { + cxl_access_coordinate_set(dport->coord, sslbis->data_type, val); + } } } @@ -502,6 +514,21 @@ void cxl_switch_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(cxl_switch_parse_cdat, CXL); +static void __cxl_coordinates_combine(struct access_coordinate *out, + struct access_coordinate *c1, + struct access_coordinate *c2) +{ + if (c1->write_bandwidth && c2->write_bandwidth) + out->write_bandwidth = min(c1->write_bandwidth, + c2->write_bandwidth); + out->write_latency = c1->write_latency + c2->write_latency; + + if (c1->read_bandwidth && c2->read_bandwidth) + out->read_bandwidth = min(c1->read_bandwidth, + c2->read_bandwidth); + out->read_latency = c1->read_latency + c2->read_latency; +} + /** * cxl_coordinates_combine - Combine the two input coordinates * @@ -513,15 +540,8 @@ void cxl_coordinates_combine(struct access_coordinate *out, struct access_coordinate *c1, struct access_coordinate *c2) { - if (c1->write_bandwidth && c2->write_bandwidth) - out->write_bandwidth = min(c1->write_bandwidth, - c2->write_bandwidth); - out->write_latency = c1->write_latency + c2->write_latency; - - if (c1->read_bandwidth && c2->read_bandwidth) - out->read_bandwidth = min(c1->read_bandwidth, - c2->read_bandwidth); - out->read_latency = c1->read_latency + c2->read_latency; + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + __cxl_coordinates_combine(&out[i], &c1[i], &c2[i]); } MODULE_IMPORT_NS(CXL); @@ -558,12 +578,12 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, /* Get total bandwidth and the worst latency for the cxl region */ cxlr->coord[i].read_latency = max_t(unsigned int, cxlr->coord[i].read_latency, - perf->coord.read_latency); + perf->coord[i].read_latency); cxlr->coord[i].write_latency = max_t(unsigned int, cxlr->coord[i].write_latency, - perf->coord.write_latency); - cxlr->coord[i].read_bandwidth += perf->coord.read_bandwidth; - cxlr->coord[i].write_bandwidth += perf->coord.write_bandwidth; + perf->coord[i].write_latency); + cxlr->coord[i].read_bandwidth += perf->coord[i].read_bandwidth; + cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth; } } diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c7c00eb373af..801c4018a1dd 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2133,6 +2133,29 @@ bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) } EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL); +static void add_latency(struct access_coordinate *c, long latency) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].write_latency += latency; + c[i].read_latency += latency; + } +} + +static void set_min_bandwidth(struct access_coordinate *c, unsigned int bw) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].write_bandwidth = min(c[i].write_bandwidth, bw); + c[i].read_bandwidth = min(c[i].read_bandwidth, bw); + } +} + +static void set_access_coordinates(struct access_coordinate *out, + struct access_coordinate *in) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + out[i] = in[i]; +} + static bool parent_port_is_cxl_root(struct cxl_port *port) { return is_cxl_root(to_cxl_port(port->dev.parent)); @@ -2149,9 +2172,15 @@ static bool parent_port_is_cxl_root(struct cxl_port *port) int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct access_coordinate *coord) { - struct access_coordinate c = { - .read_bandwidth = UINT_MAX, - .write_bandwidth = UINT_MAX, + struct access_coordinate c[] = { + { + .read_bandwidth = UINT_MAX, + .write_bandwidth = UINT_MAX, + }, + { + .read_bandwidth = UINT_MAX, + .write_bandwidth = UINT_MAX, + }, }; struct cxl_port *iter = port; struct cxl_dport *dport; @@ -2178,14 +2207,13 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, * have CDAT and therefore needs to be skipped. */ if (!is_cxl_root) - cxl_coordinates_combine(&c, &c, &dport->sw_coord); - c.write_latency += dport->link_latency; - c.read_latency += dport->link_latency; + cxl_coordinates_combine(c, c, dport->coord); + add_latency(c, dport->link_latency); } while (!is_cxl_root); dport = iter->parent_dport; /* Retrieve HB coords */ - cxl_coordinates_combine(&c, &c, dport->hb_coord); + cxl_coordinates_combine(c, c, dport->coord); /* Get the calculated PCI paths bandwidth */ pdev = to_pci_dev(port->uport_dev->parent); @@ -2194,10 +2222,8 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, return -ENXIO; bw /= BITS_PER_BYTE; - c.write_bandwidth = min(c.write_bandwidth, bw); - c.read_bandwidth = min(c.read_bandwidth, bw); - - *coord = c; + set_min_bandwidth(c, bw); + set_access_coordinates(coord, c); return 0; } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ed02373ce3d9..036d17db68e0 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -663,8 +663,7 @@ struct cxl_rcrb_info { * @rch: Indicate whether this dport was enumerated in RCH or VH mode * @port: reference to cxl_port that contains this downstream port * @regs: Dport parsed register blocks - * @sw_coord: access coordinates (performance) for switch from CDAT - * @hb_coord: access coordinates (performance) from ACPI generic port (host bridge) + * @coord: access coordinates (bandwidth and latency performance attributes) * @link_latency: calculated PCIe downstream latency */ struct cxl_dport { @@ -675,8 +674,7 @@ struct cxl_dport { bool rch; struct cxl_port *port; struct cxl_regs regs; - struct access_coordinate sw_coord; - struct access_coordinate hb_coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; long link_latency; }; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 20fb3b35e89e..36cee9c30ceb 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -401,7 +401,7 @@ enum cxl_devtype { */ struct cxl_dpa_perf { struct range dpa_range; - struct access_coordinate coord; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; int qos_class; }; diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 908e0d083936..61c69297e797 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -986,10 +986,12 @@ static void dpa_perf_setup(struct cxl_port *endpoint, struct range *range, { dpa_perf->qos_class = FAKE_QTG_ID; dpa_perf->dpa_range = *range; - dpa_perf->coord.read_latency = 500; - dpa_perf->coord.write_latency = 500; - dpa_perf->coord.read_bandwidth = 1000; - dpa_perf->coord.write_bandwidth = 1000; + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + dpa_perf->coord[i].read_latency = 500; + dpa_perf->coord[i].write_latency = 500; + dpa_perf->coord[i].read_bandwidth = 1000; + dpa_perf->coord[i].write_bandwidth = 1000; + } } static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) From 7bcf809b1e7889ab7e75fe1fcf8f1a98332f36d2 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:16 -0700 Subject: [PATCH 083/270] cxl: Add checks to access_coordinate calculation to fail missing data Jonathan noted that when the coordinates for host bridge and switches can be 0s if no actual data are retrieved and the calculation continues. The resulting number would be inaccurate. Add checks to ensure that the calculation would complete only if the numbers are valid. While not seen in the wild, issue may show up with a BIOS that reported CXL root ports via Generic Ports (via a PCI handle in the SRAT entry). Fixes: 14a6960b3e92 ("cxl: Add helper function that calculate performance data for downstream ports") Reported-by: Jonathan Cameron Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-6-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 801c4018a1dd..762783bb091a 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2141,6 +2141,18 @@ static void add_latency(struct access_coordinate *c, long latency) } } +static bool coordinates_valid(struct access_coordinate *c) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + if (c[i].read_bandwidth && c[i].write_bandwidth && + c[i].read_latency && c[i].write_latency) + continue; + return false; + } + + return true; +} + static void set_min_bandwidth(struct access_coordinate *c, unsigned int bw) { for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { @@ -2206,13 +2218,18 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, * There's no valid access_coordinate for a root port since RPs do not * have CDAT and therefore needs to be skipped. */ - if (!is_cxl_root) + if (!is_cxl_root) { + if (!coordinates_valid(dport->coord)) + return -EINVAL; cxl_coordinates_combine(c, c, dport->coord); + } add_latency(c, dport->link_latency); } while (!is_cxl_root); dport = iter->parent_dport; /* Retrieve HB coords */ + if (!coordinates_valid(dport->coord)) + return -EINVAL; cxl_coordinates_combine(c, c, dport->coord); /* Get the calculated PCI paths bandwidth */ From 7b1f6b5aaec0f849e19c3e99d4eea75876853cdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 2 Apr 2024 18:50:03 +0300 Subject: [PATCH 084/270] drm/i915/cdclk: Fix CDCLK programming order when pipes are active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we always reprogram CDCLK from the intel_set_cdclk_pre_plane_update() when using squash/crawl. The code only works correctly for the cd2x update or full modeset cases, and it was simply never updated to deal with squash/crawl. If the CDCLK frequency is increasing we must reprogram it before we do anything else that might depend on the new higher frequency, and conversely we must not decrease the frequency until everything that might still depend on the old higher frequency has been dealt with. Since cdclk_state->pipe is only relevant when doing a cd2x update we can't use it to determine the correct sequence during squash/crawl. To that end introduce cdclk_state->disable_pipes which simply indicates that we must perform the update while the pipes are disable (ie. during intel_set_cdclk_pre_plane_update()). Otherwise we use the same old vs. new CDCLK frequency comparsiong as for cd2x updates. The only remaining problem case is when the voltage_level needs to increase due to a DDI port, but the CDCLK frequency is decreasing (and not all pipes are being disabled). The current approach will not bump the voltage level up until after the port has already been enabled, which is too late. But we'll take care of that case separately. v2: Don't break the "must disable pipes case" v3: Keep the on stack 'pipe' for future use Cc: stable@vger.kernel.org Fixes: d62686ba3b54 ("drm/i915/adl_p: CDCLK crawl support for ADL") Reviewed-by: Uma Shankar Reviewed-by: Gustavo Sousa Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240402155016.13733-2-ville.syrjala@linux.intel.com (cherry picked from commit 3aecee90ac12a351905f12dda7643d5b0676d6ca) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_cdclk.c | 7 +++++-- drivers/gpu/drm/i915/display/intel_cdclk.h | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c index ed89b86ea625..4833479e2e17 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.c +++ b/drivers/gpu/drm/i915/display/intel_cdclk.c @@ -2543,7 +2543,7 @@ intel_set_cdclk_pre_plane_update(struct intel_atomic_state *state) if (IS_DG2(i915)) intel_cdclk_pcode_pre_notify(state); - if (pipe == INVALID_PIPE || + if (new_cdclk_state->disable_pipes || old_cdclk_state->actual.cdclk <= new_cdclk_state->actual.cdclk) { drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); @@ -2575,7 +2575,7 @@ intel_set_cdclk_post_plane_update(struct intel_atomic_state *state) if (IS_DG2(i915)) intel_cdclk_pcode_post_notify(state); - if (pipe != INVALID_PIPE && + if (!new_cdclk_state->disable_pipes && old_cdclk_state->actual.cdclk > new_cdclk_state->actual.cdclk) { drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); @@ -3058,6 +3058,7 @@ static struct intel_global_state *intel_cdclk_duplicate_state(struct intel_globa return NULL; cdclk_state->pipe = INVALID_PIPE; + cdclk_state->disable_pipes = false; return &cdclk_state->base; } @@ -3236,6 +3237,8 @@ int intel_modeset_calc_cdclk(struct intel_atomic_state *state) if (ret) return ret; + new_cdclk_state->disable_pipes = true; + drm_dbg_kms(&dev_priv->drm, "Modeset required for cdclk change\n"); } diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.h b/drivers/gpu/drm/i915/display/intel_cdclk.h index 48fd7d39e0cd..71bc032bfef1 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.h +++ b/drivers/gpu/drm/i915/display/intel_cdclk.h @@ -51,6 +51,9 @@ struct intel_cdclk_state { /* bitmask of active pipes */ u8 active_pipes; + + /* update cdclk with pipes disabled */ + bool disable_pipes; }; int intel_crtc_compute_min_cdclk(const struct intel_crtc_state *crtc_state); From 6154cc9177ccea00c89ce0bf93352e474b819ff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 2 Apr 2024 18:50:04 +0300 Subject: [PATCH 085/270] drm/i915/cdclk: Fix voltage_level programming edge case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we only consider the relationship of the old and new CDCLK frequencies when determining whether to do the repgramming from intel_set_cdclk_pre_plane_update() or intel_set_cdclk_post_plane_update(). It is technically possible to have a situation where the CDCLK frequency is decreasing, but the voltage_level is increasing due a DDI port. In this case we should bump the voltage level already in intel_set_cdclk_pre_plane_update() (so that the voltage_level will have been increased by the time the port gets enabled), while leaving the CDCLK frequency unchanged (as active planes/etc. may still depend on it). We can then reduce the CDCLK frequency to its final value from intel_set_cdclk_post_plane_update(). In order to handle that correctly we shall construct a suitable amalgam of the old and new cdclk states in intel_set_cdclk_pre_plane_update(). And we can simply call intel_set_cdclk() unconditionally in both places as it will not do anything if nothing actually changes vs. the current hw state. v2: Handle cdclk_state->disable_pipes v3: Only synchronize the cd2x update against the pipe's vblank when the cdclk frequency is changing during the current commit phase (Gustavo) Cc: stable@vger.kernel.org Cc: Gustavo Sousa Reviewed-by: Uma Shankar Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240402155016.13733-3-ville.syrjala@linux.intel.com (cherry picked from commit 34d127e2bdef73a923aa0dcd95cbc3257ad5af52) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_cdclk.c | 37 ++++++++++++++++------ 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c index 4833479e2e17..f672bfd70d45 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.c +++ b/drivers/gpu/drm/i915/display/intel_cdclk.c @@ -2534,7 +2534,8 @@ intel_set_cdclk_pre_plane_update(struct intel_atomic_state *state) intel_atomic_get_old_cdclk_state(state); const struct intel_cdclk_state *new_cdclk_state = intel_atomic_get_new_cdclk_state(state); - enum pipe pipe = new_cdclk_state->pipe; + struct intel_cdclk_config cdclk_config; + enum pipe pipe; if (!intel_cdclk_changed(&old_cdclk_state->actual, &new_cdclk_state->actual)) @@ -2543,12 +2544,25 @@ intel_set_cdclk_pre_plane_update(struct intel_atomic_state *state) if (IS_DG2(i915)) intel_cdclk_pcode_pre_notify(state); - if (new_cdclk_state->disable_pipes || - old_cdclk_state->actual.cdclk <= new_cdclk_state->actual.cdclk) { - drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + if (new_cdclk_state->disable_pipes) { + cdclk_config = new_cdclk_state->actual; + pipe = INVALID_PIPE; + } else { + if (new_cdclk_state->actual.cdclk >= old_cdclk_state->actual.cdclk) { + cdclk_config = new_cdclk_state->actual; + pipe = new_cdclk_state->pipe; + } else { + cdclk_config = old_cdclk_state->actual; + pipe = INVALID_PIPE; + } - intel_set_cdclk(i915, &new_cdclk_state->actual, pipe); + cdclk_config.voltage_level = max(new_cdclk_state->actual.voltage_level, + old_cdclk_state->actual.voltage_level); } + + drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + + intel_set_cdclk(i915, &cdclk_config, pipe); } /** @@ -2566,7 +2580,7 @@ intel_set_cdclk_post_plane_update(struct intel_atomic_state *state) intel_atomic_get_old_cdclk_state(state); const struct intel_cdclk_state *new_cdclk_state = intel_atomic_get_new_cdclk_state(state); - enum pipe pipe = new_cdclk_state->pipe; + enum pipe pipe; if (!intel_cdclk_changed(&old_cdclk_state->actual, &new_cdclk_state->actual)) @@ -2576,11 +2590,14 @@ intel_set_cdclk_post_plane_update(struct intel_atomic_state *state) intel_cdclk_pcode_post_notify(state); if (!new_cdclk_state->disable_pipes && - old_cdclk_state->actual.cdclk > new_cdclk_state->actual.cdclk) { - drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + new_cdclk_state->actual.cdclk < old_cdclk_state->actual.cdclk) + pipe = new_cdclk_state->pipe; + else + pipe = INVALID_PIPE; - intel_set_cdclk(i915, &new_cdclk_state->actual, pipe); - } + drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + + intel_set_cdclk(i915, &new_cdclk_state->actual, pipe); } static int intel_pixel_rate_to_cdclk(const struct intel_crtc_state *crtc_state) From 12bcd9108f9d3b8d4b5f4418bd16df4628b6fa8f Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Mon, 1 Apr 2024 11:26:53 +0530 Subject: [PATCH 086/270] drm/i915/hdcp: Fix get remote hdcp capability function HDCP 1.x capability needs to be checked even if setup is not HDCP 2.x capable. --v2 -Assign hdcp_capable and hdcp2_capable to false [Chaitanya] --v3 -Fix variable assignment [Chaitanya] Fixes: 813cca96e4ac ("drm/i915/hdcp: Add new remote capability check shim function") Signed-off-by: Suraj Kandpal Reviewed-by: Chaitanya Kumar Borah Signed-off-by: Animesh Manna Link: https://patchwork.freedesktop.org/patch/msgid/20240401055652.276785-2-suraj.kandpal@intel.com (cherry picked from commit 6809f9246d43f7cb07310ca6a3deb7aa1c0ea938) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp_hdcp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_hdcp.c b/drivers/gpu/drm/i915/display/intel_dp_hdcp.c index b98a87883fef..9db43bd81ce2 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_hdcp.c +++ b/drivers/gpu/drm/i915/display/intel_dp_hdcp.c @@ -691,12 +691,15 @@ int intel_dp_hdcp_get_remote_capability(struct intel_connector *connector, u8 bcaps; int ret; + *hdcp_capable = false; + *hdcp2_capable = false; if (!intel_encoder_is_mst(connector->encoder)) return -EINVAL; ret = _intel_dp_hdcp2_get_capability(aux, hdcp2_capable); if (ret) - return ret; + drm_dbg_kms(&i915->drm, + "HDCP2 DPCD capability read failed err: %d\n", ret); ret = intel_dp_hdcp_read_bcaps(aux, i915, &bcaps); if (ret) From 152191e5e94bba55c938c18688e66c7276b765a7 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 29 Mar 2024 16:53:05 -0700 Subject: [PATCH 087/270] drm/i915/guc: Fix the fix for reset lock confusion The previous fix for the circlular lock splat about the busyness worker wasn't quite complete. Even though the reset-in-progress flag is cleared at the start of intel_uc_reset_finish, the entire function is still inside the reset mutex lock. Not sure why the patch appeared to fix the issue both locally and in CI. However, it is now back again. There is a further complication that the wedge code path within intel_gt_reset() jumps around so much that it results in nested reset_prepare/_finish calls. That is, the call sequence is: intel_gt_reset | reset_prepare | __intel_gt_set_wedged | | reset_prepare | | reset_finish | reset_finish The nested finish means that even if the clear of the in-progress flag was moved to the end of _finish, it would still be clear for the entire second call. Surprisingly, this does not seem to be causing any other problems at present. As an aside, a wedge on fini does not call the finish functions at all. The reset_in_progress flag is left set (twice). So instead of trying to cancel the worker anywhere at all in the reset path, just add a cancel to intel_guc_submission_fini instead. Note that it is not a problem if the worker is still active during a reset. Either it will run before the reset path starts locking things and will simply block the reset code for a tiny amount of time. Or it will run after the locks have been acquired and will early exit due to the try-lock. Also, do not use the reset-in-progress flag to decide whether a synchronous cancel is safe (from a lockdep perspective) or not. Instead, use the actual reset mutex state (both the genuine one and the custom rolled BACKOFF one). Fixes: 0e00a8814eec ("drm/i915/guc: Avoid circular locking issue on busyness flush") Signed-off-by: John Harrison Cc: Zhanjun Dong Cc: John Harrison Cc: Andi Shyti Cc: Daniel Vetter Cc: Daniel Vetter Cc: Rodrigo Vivi Cc: Nirmoy Das Cc: Tvrtko Ursulin Cc: Umesh Nerlige Ramappa Cc: Andrzej Hajda Cc: Matt Roper Cc: Jonathan Cavitt Cc: Prathap Kumar Valsan Cc: Alan Previn Cc: Madhumitha Tolakanahalli Pradeep Cc: Daniele Ceraolo Spurio Cc: Ashutosh Dixit Cc: Dnyaneshwar Bhadane Reviewed-by: Nirmoy Das Reviewed-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240329235306.1559639-1-John.C.Harrison@Intel.com (cherry picked from commit 3563d855312acedcd445a3767f0cb07906f1c26f) Signed-off-by: Rodrigo Vivi --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 23 ++++++++----------- drivers/gpu/drm/i915/gt/uc/intel_uc.c | 4 ++++ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index f3dcae4b9d45..0f83c6d4376f 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1403,14 +1403,17 @@ static void guc_cancel_busyness_worker(struct intel_guc *guc) * Trying to pass a 'need_sync' or 'in_reset' flag all the way down through * every possible call stack is unfeasible. It would be too intrusive to many * areas that really don't care about the GuC backend. However, there is the - * 'reset_in_progress' flag available, so just use that. + * I915_RESET_BACKOFF flag and the gt->reset.mutex can be tested for is_locked. + * So just use those. Note that testing both is required due to the hideously + * complex nature of the i915 driver's reset code paths. * * And note that in the case of a reset occurring during driver unload - * (wedge_on_fini), skipping the cancel in _prepare (when the reset flag is set - * is fine because there is another cancel in _finish (when the reset flag is - * not). + * (wedged_on_fini), skipping the cancel in reset_prepare/reset_fini (when the + * reset flag/mutex are set) is fine because there is another explicit cancel in + * intel_guc_submission_fini (when the reset flag/mutex are not). */ - if (guc_to_gt(guc)->uc.reset_in_progress) + if (mutex_is_locked(&guc_to_gt(guc)->reset.mutex) || + test_bit(I915_RESET_BACKOFF, &guc_to_gt(guc)->reset.flags)) cancel_delayed_work(&guc->timestamp.work); else cancel_delayed_work_sync(&guc->timestamp.work); @@ -1424,8 +1427,6 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) unsigned long flags; ktime_t unused; - guc_cancel_busyness_worker(guc); - spin_lock_irqsave(&guc->timestamp.lock, flags); guc_update_pm_timestamp(guc, &unused); @@ -2004,13 +2005,6 @@ void intel_guc_submission_cancel_requests(struct intel_guc *guc) void intel_guc_submission_reset_finish(struct intel_guc *guc) { - /* - * Ensure the busyness worker gets cancelled even on a fatal wedge. - * Note that reset_prepare is not allowed to because it confuses lockdep. - */ - if (guc_submission_initialized(guc)) - guc_cancel_busyness_worker(guc); - /* Reset called during driver load or during wedge? */ if (unlikely(!guc_submission_initialized(guc) || !intel_guc_is_fw_running(guc) || @@ -2136,6 +2130,7 @@ void intel_guc_submission_fini(struct intel_guc *guc) if (!guc->submission_initialized) return; + guc_fini_engine_stats(guc); guc_flush_destroyed_contexts(guc); guc_lrc_desc_pool_destroy_v69(guc); i915_sched_engine_put(guc->sched_engine); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c index 6dfe5d9456c6..399bc319180b 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c @@ -637,6 +637,10 @@ void intel_uc_reset_finish(struct intel_uc *uc) { struct intel_guc *guc = &uc->guc; + /* + * NB: The wedge code path results in prepare -> prepare -> finish -> finish. + * So this function is sometimes called with the in-progress flag not set. + */ uc->reset_in_progress = false; /* Firmware expected to be running when this function is called */ From e3d4ead4d48c05355bd3b99c8162428f68c3c1a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:26 +0300 Subject: [PATCH 088/270] drm/i915/psr: Disable PSR when bigjoiner is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bigjoiner seem to be causing all kinds of grief to the PSR code currently. I don't believe there is any hardware issue but the code simply not handling this correctly. For now just disable PSR when bigjoiner is needed. Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-3-ville.syrjala@linux.intel.com Reviewed-by: Arun R Murthy Acked-by: Jouni Högander Signed-off-by: Ville Syrjälä (cherry picked from commit 372fa0c79d3f289f813d8001e0a8a96d1011826c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_psr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index b6e539f1342c..aabd018bd737 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -1422,6 +1422,17 @@ void intel_psr_compute_config(struct intel_dp *intel_dp, return; } + /* + * FIXME figure out what is wrong with PSR+bigjoiner and + * fix it. Presumably something related to the fact that + * PSR is a transcoder level feature. + */ + if (crtc_state->bigjoiner_pipes) { + drm_dbg_kms(&dev_priv->drm, + "PSR disabled due to bigjoiner\n"); + return; + } + if (CAN_PANEL_REPLAY(intel_dp)) crtc_state->has_panel_replay = true; else From 0653d501409eeb9f1deb7e4c12e4d0d2c9f1cba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:27 +0300 Subject: [PATCH 089/270] drm/i915: Disable port sync when bigjoiner is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current modeset sequence can't handle port sync and bigjoiner at the same time. Refuse port sync when bigjoiner is needed, at least until we fix the modeset sequence. v2: Add a FIXME (Vandite) Cc: stable@vger.kernel.org Tested-by: Vidya Srinivas Reviewed-by: Vandita Kulkarni Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-4-ville.syrjala@linux.intel.com Signed-off-by: Ville Syrjälä (cherry picked from commit b37e1347b991459c38c56ec2476087854a4f720b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_ddi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index c587a8efeafc..c17462b4c2ac 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -4256,7 +4256,12 @@ static bool m_n_equal(const struct intel_link_m_n *m_n_1, static bool crtcs_port_sync_compatible(const struct intel_crtc_state *crtc_state1, const struct intel_crtc_state *crtc_state2) { + /* + * FIXME the modeset sequence is currently wrong and + * can't deal with bigjoiner + port sync at the same time. + */ return crtc_state1->hw.active && crtc_state2->hw.active && + !crtc_state1->bigjoiner_pipes && !crtc_state2->bigjoiner_pipes && crtc_state1->output_types == crtc_state2->output_types && crtc_state1->output_format == crtc_state2->output_format && crtc_state1->lane_count == crtc_state2->lane_count && From 4a36e46df7aa781c756f09727d37dc2783f1ee75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:28 +0300 Subject: [PATCH 090/270] drm/i915: Disable live M/N updates when using bigjoiner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All joined pipes share the same transcoder/timing generator. Currently we just do the commits per-pipe, which doesn't really work if we need to change the timings at the same time. For now just disable live M/N updates when bigjoiner is needed. Cc: stable@vger.kernel.org Tested-by: Vidya Srinivas Reviewed-by: Arun R Murthy Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-5-ville.syrjala@linux.intel.com Signed-off-by: Ville Syrjälä (cherry picked from commit ef79820db723a2a7c229a7251c12859e7e25a247) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index abd62bebc46d..e583515f9b25 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -2725,7 +2725,11 @@ intel_dp_drrs_compute_config(struct intel_connector *connector, intel_panel_downclock_mode(connector, &pipe_config->hw.adjusted_mode); int pixel_clock; - if (has_seamless_m_n(connector)) + /* + * FIXME all joined pipes share the same transcoder. + * Need to account for that when updating M/N live. + */ + if (has_seamless_m_n(connector) && !pipe_config->bigjoiner_pipes) pipe_config->update_m_n = true; if (!can_enable_drrs(connector, pipe_config, downclock_mode)) { From dcd8992e47f13afb5c11a61e8d9c141c35e23751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:29 +0300 Subject: [PATCH 091/270] drm/i915/vrr: Disable VRR when using bigjoiner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All joined pipes share the same transcoder/timing generator. Currently we just do the commits per-pipe, which doesn't really work if we need to change switch between non-VRR and VRR timings generators on the fly, or even when sending the push to the transcoder. For now just disable VRR when bigjoiner is needed. Cc: stable@vger.kernel.org Tested-by: Vidya Srinivas Reviewed-by: Vandita Kulkarni Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-6-ville.syrjala@linux.intel.com Signed-off-by: Ville Syrjälä (cherry picked from commit f9d5e51db65652dbd8a2102fd7619440e3599fd2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_vrr.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_vrr.c b/drivers/gpu/drm/i915/display/intel_vrr.c index eb5bd0743902..f542ee1db1d9 100644 --- a/drivers/gpu/drm/i915/display/intel_vrr.c +++ b/drivers/gpu/drm/i915/display/intel_vrr.c @@ -117,6 +117,13 @@ intel_vrr_compute_config(struct intel_crtc_state *crtc_state, const struct drm_display_info *info = &connector->base.display_info; int vmin, vmax; + /* + * FIXME all joined pipes share the same transcoder. + * Need to account for that during VRR toggle/push/etc. + */ + if (crtc_state->bigjoiner_pipes) + return; + if (adjusted_mode->flags & DRM_MODE_FLAG_INTERLACE) return; From 4fe82aedeb8a8cb09bfa60f55ab57b5c10a74ac4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Apr 2024 18:11:09 +0100 Subject: [PATCH 092/270] io_uring/net: restore msg_control on sendzc retry cac9e4418f4cb ("io_uring/net: save msghdr->msg_control for retries") reinstatiates msg_control before every __sys_sendmsg_sock(), since the function can overwrite the value in msghdr. We need to do same for zerocopy sendmsg. Cc: stable@vger.kernel.org Fixes: 493108d95f146 ("io_uring/net: zerocopy sendmsg") Link: https://github.com/axboe/liburing/issues/1067 Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cc1d5d9df0576fa66ddad4420d240a98a020b267.1712596179.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/net.c b/io_uring/net.c index 1e7665ff6ef7..4afb475d4197 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1276,6 +1276,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { kmsg = req->async_data; + kmsg->msg.msg_control_user = sr->msg_control; } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) From 80e9963fb3b5509dfcabe9652d56bf4b35542055 Mon Sep 17 00:00:00 2001 From: Nianyao Tang Date: Sat, 6 Apr 2024 02:27:37 +0000 Subject: [PATCH 093/270] irqchip/gic-v3-its: Fix VSYNC referencing an unmapped VPE on GIC v4.1 As per the GICv4.1 spec (Arm IHI 0069H, 5.3.19): "A VMAPP with {V, Alloc}=={0, x} is self-synchronizing, This means the ITS command queue does not show the command as consumed until all of its effects are completed." Furthermore, VSYNC is allowed to deliver an SError when referencing a non existent VPE. By these definitions, a VMAPP followed by a VSYNC is a bug, as the later references a VPE that has been unmapped by the former. Fix it by eliding the VSYNC in this scenario. Fixes: 64edfaa9a234 ("irqchip/gic-v4.1: Implement the v4.1 flavour of VMAPP") Signed-off-by: Nianyao Tang Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20240406022737.3898763-1-tangnianyao@huawei.com --- drivers/irqchip/irq-gic-v3-its.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index fca888b36680..2a537cbfcb07 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -786,6 +786,7 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, struct its_cmd_block *cmd, struct its_cmd_desc *desc) { + struct its_vpe *vpe = valid_vpe(its, desc->its_vmapp_cmd.vpe); unsigned long vpt_addr, vconf_addr; u64 target; bool alloc; @@ -798,6 +799,11 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, if (is_v4_1(its)) { alloc = !atomic_dec_return(&desc->its_vmapp_cmd.vpe->vmapp_count); its_encode_alloc(cmd, alloc); + /* + * Unmapping a VPE is self-synchronizing on GICv4.1, + * no need to issue a VSYNC. + */ + vpe = NULL; } goto out; @@ -832,7 +838,7 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, out: its_fixup_cmd(cmd); - return valid_vpe(its, desc->its_vmapp_cmd.vpe); + return vpe; } static struct its_vpe *its_build_vmapti_cmd(struct its_node *its, From 4370b673ccf240bf7587b0cb8e6726a5ccaf1f17 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Thu, 28 Mar 2024 14:27:56 +0000 Subject: [PATCH 094/270] MIPS: scall: Save thread_info.syscall unconditionally on entry thread_info.syscall is used by syscall_get_nr to supply syscall nr over a thread stack frame. Previously, thread_info.syscall is only saved at syscall_trace_enter when syscall tracing is enabled. However rest of the kernel code do expect syscall_get_nr to be available without syscall tracing. The previous design breaks collect_syscall. Move saving process to syscall entry to fix it. Reported-by: Xi Ruoyao Link: https://github.com/util-linux/util-linux/issues/2867 Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/ptrace.h | 2 +- arch/mips/kernel/asm-offsets.c | 1 + arch/mips/kernel/ptrace.c | 15 ++++++--------- arch/mips/kernel/scall32-o32.S | 23 +++++++++++++---------- arch/mips/kernel/scall64-n32.S | 3 ++- arch/mips/kernel/scall64-n64.S | 3 ++- arch/mips/kernel/scall64-o32.S | 33 +++++++++++++++++---------------- 7 files changed, 42 insertions(+), 38 deletions(-) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index d14d0e37ad02..4a2b40ce39e0 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -159,7 +159,7 @@ extern unsigned long exception_ip(struct pt_regs *regs); #define exception_ip(regs) exception_ip(regs) #define profile_pc(regs) instruction_pointer(regs) -extern asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall); +extern asmlinkage long syscall_trace_enter(struct pt_regs *regs); extern asmlinkage void syscall_trace_leave(struct pt_regs *regs); extern void die(const char *, struct pt_regs *) __noreturn; diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index d1b11f66f748..cb1045ebab06 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -101,6 +101,7 @@ void output_thread_info_defines(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); OFFSET(TI_REGS, thread_info, regs); + OFFSET(TI_SYSCALL, thread_info, syscall); DEFINE(_THREAD_SIZE, THREAD_SIZE); DEFINE(_THREAD_MASK, THREAD_MASK); DEFINE(_IRQ_STACK_SIZE, IRQ_STACK_SIZE); diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 59288c13b581..61503a36067e 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -1317,16 +1317,13 @@ long arch_ptrace(struct task_struct *child, long request, * Notification of system call entry/exit * - triggered by current->work.syscall_trace */ -asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) +asmlinkage long syscall_trace_enter(struct pt_regs *regs) { user_exit(); - current_thread_info()->syscall = syscall; - if (test_thread_flag(TIF_SYSCALL_TRACE)) { if (ptrace_report_syscall_entry(regs)) return -1; - syscall = current_thread_info()->syscall; } #ifdef CONFIG_SECCOMP @@ -1335,7 +1332,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) struct seccomp_data sd; unsigned long args[6]; - sd.nr = syscall; + sd.nr = current_thread_info()->syscall; sd.arch = syscall_get_arch(current); syscall_get_arguments(current, regs, args); for (i = 0; i < 6; i++) @@ -1345,23 +1342,23 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) ret = __secure_computing(&sd); if (ret == -1) return ret; - syscall = current_thread_info()->syscall; } #endif if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[2]); - audit_syscall_entry(syscall, regs->regs[4], regs->regs[5], + audit_syscall_entry(current_thread_info()->syscall, + regs->regs[4], regs->regs[5], regs->regs[6], regs->regs[7]); /* * Negative syscall numbers are mistaken for rejected syscalls, but * won't have had the return value set appropriately, so we do so now. */ - if (syscall < 0) + if (current_thread_info()->syscall < 0) syscall_set_return_value(current, regs, -ENOSYS, 0); - return syscall; + return current_thread_info()->syscall; } /* diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 18dc9b345056..2c604717e630 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -77,6 +77,18 @@ loads_done: PTR_WD load_a7, bad_stack_a7 .previous + /* + * syscall number is in v0 unless we called syscall(__NR_###) + * where the real syscall number is in a0 + */ + subu t2, v0, __NR_O32_Linux + bnez t2, 1f /* __NR_syscall at offset 0 */ + LONG_S a0, TI_SYSCALL($28) # Save a0 as syscall number + b 2f +1: + LONG_S v0, TI_SYSCALL($28) # Save v0 as syscall number +2: + lw t0, TI_FLAGS($28) # syscall tracing enabled? li t1, _TIF_WORK_SYSCALL_ENTRY and t0, t1 @@ -114,16 +126,7 @@ syscall_trace_entry: SAVE_STATIC move a0, sp - /* - * syscall number is in v0 unless we called syscall(__NR_###) - * where the real syscall number is in a0 - */ - move a1, v0 - subu t2, v0, __NR_O32_Linux - bnez t2, 1f /* __NR_syscall at offset 0 */ - lw a1, PT_R4(sp) - -1: jal syscall_trace_enter + jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 97456b2ca7dc..97788859238c 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -44,6 +44,8 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd a3, PT_R26(sp) # save a3 for syscall restarting + LONG_S v0, TI_SYSCALL($28) # Store syscall number + li t1, _TIF_WORK_SYSCALL_ENTRY LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? and t0, t1, t0 @@ -72,7 +74,6 @@ syscall_common: n32_syscall_trace_entry: SAVE_STATIC move a0, sp - move a1, v0 jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall diff --git a/arch/mips/kernel/scall64-n64.S b/arch/mips/kernel/scall64-n64.S index e6264aa62e45..be11ea5cc67e 100644 --- a/arch/mips/kernel/scall64-n64.S +++ b/arch/mips/kernel/scall64-n64.S @@ -46,6 +46,8 @@ NESTED(handle_sys64, PT_SIZE, sp) sd a3, PT_R26(sp) # save a3 for syscall restarting + LONG_S v0, TI_SYSCALL($28) # Store syscall number + li t1, _TIF_WORK_SYSCALL_ENTRY LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? and t0, t1, t0 @@ -82,7 +84,6 @@ n64_syscall_exit: syscall_trace_entry: SAVE_STATIC move a0, sp - move a1, v0 jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index d3c2616cba22..7a5abb73e531 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -79,6 +79,22 @@ loads_done: PTR_WD load_a7, bad_stack_a7 .previous + /* + * absolute syscall number is in v0 unless we called syscall(__NR_###) + * where the real syscall number is in a0 + * note: NR_syscall is the first O32 syscall but the macro is + * only defined when compiling with -mabi=32 (CONFIG_32BIT) + * therefore __NR_O32_Linux is used (4000) + */ + + subu t2, v0, __NR_O32_Linux + bnez t2, 1f /* __NR_syscall at offset 0 */ + LONG_S a0, TI_SYSCALL($28) # Save a0 as syscall number + b 2f +1: + LONG_S v0, TI_SYSCALL($28) # Save v0 as syscall number +2: + li t1, _TIF_WORK_SYSCALL_ENTRY LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? and t0, t1, t0 @@ -113,22 +129,7 @@ trace_a_syscall: sd a7, PT_R11(sp) # For indirect syscalls move a0, sp - /* - * absolute syscall number is in v0 unless we called syscall(__NR_###) - * where the real syscall number is in a0 - * note: NR_syscall is the first O32 syscall but the macro is - * only defined when compiling with -mabi=32 (CONFIG_32BIT) - * therefore __NR_O32_Linux is used (4000) - */ - .set push - .set reorder - subu t1, v0, __NR_O32_Linux - move a1, v0 - bnez t1, 1f /* __NR_syscall at offset 0 */ - ld a1, PT_R4(sp) /* Arg1 for __NR_syscall case */ - .set pop - -1: jal syscall_trace_enter + jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall From 011d79ef1cfad701c2d8e7e80d8c77523af9c771 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 8 Apr 2024 09:32:34 +0200 Subject: [PATCH 095/270] MAINTAINERS: Change Krzysztof Kozlowski's email address Switch Krzysztof Kozlowski's to @kernel.org account. Link: https://lore.kernel.org/r/20240329174823.74918-1-krzysztof.kozlowski@linaro.org Signed-off-by: Krzysztof Kozlowski Signed-off-by: Arnd Bergmann --- MAINTAINERS | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d..cb0049030acd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2708,7 +2708,7 @@ F: sound/soc/rockchip/ N: rockchip ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-samsung-soc@vger.kernel.org @@ -5557,7 +5557,7 @@ F: drivers/cpuidle/cpuidle-big_little.c CPUIDLE DRIVER - ARM EXYNOS M: Daniel Lezcano M: Kukjin Kim -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski L: linux-pm@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -9000,7 +9000,7 @@ F: drivers/i2c/muxes/i2c-mux-gpio.c F: include/linux/platform_data/i2c-mux-gpio.h GENERIC GPIO RESET DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski S: Maintained F: drivers/reset/reset-gpio.c @@ -13295,7 +13295,7 @@ F: drivers/iio/adc/max11205.c MAXIM MAX17040 FAMILY FUEL GAUGE DRIVERS R: Iskren Chernev -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski R: Marek Szyprowski R: Matheus Castello L: linux-pm@vger.kernel.org @@ -13305,7 +13305,7 @@ F: drivers/power/supply/max17040_battery.c MAXIM MAX17042 FAMILY FUEL GAUGE DRIVERS R: Hans de Goede -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski R: Marek Szyprowski R: Sebastian Krzyszkowiak R: Purism Kernel Team @@ -13363,7 +13363,7 @@ F: Documentation/devicetree/bindings/power/supply/maxim,max77976.yaml F: drivers/power/supply/max77976_charger.c MAXIM MUIC CHARGER DRIVERS FOR EXYNOS BASED BOARDS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-pm@vger.kernel.org S: Maintained B: mailto:linux-samsung-soc@vger.kernel.org @@ -13374,7 +13374,7 @@ F: drivers/power/supply/max77693_charger.c MAXIM PMIC AND MUIC DRIVERS FOR EXYNOS BASED BOARDS M: Chanwoo Choi -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained B: mailto:linux-samsung-soc@vger.kernel.org @@ -14156,7 +14156,7 @@ F: mm/mm_init.c F: tools/testing/memblock/ MEMORY CONTROLLER DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained B: mailto:krzysztof.kozlowski@linaro.org @@ -15537,7 +15537,7 @@ F: include/uapi/linux/nexthop.h F: net/ipv4/nexthop.c NFC SUBSYSTEM -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/nfc/ @@ -15914,7 +15914,7 @@ F: Documentation/devicetree/bindings/regulator/nxp,pf8x00-regulator.yaml F: drivers/regulator/pf8x00-regulator.c NXP PTN5150A CC LOGIC AND EXTCON DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/extcon/extcon-ptn5150.yaml @@ -16525,7 +16525,7 @@ K: of_overlay_remove OPEN FIRMWARE AND FLATTENED DEVICE TREE BINDINGS M: Rob Herring -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Conor Dooley L: devicetree@vger.kernel.org S: Maintained @@ -17483,7 +17483,7 @@ F: Documentation/devicetree/bindings/pinctrl/renesas,* F: drivers/pinctrl/renesas/ PIN CONTROLLER - SAMSUNG -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Sylwester Nawrocki R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -19451,7 +19451,7 @@ F: Documentation/devicetree/bindings/sound/samsung* F: sound/soc/samsung/ SAMSUNG EXYNOS PSEUDO RANDOM NUMBER GENERATOR (RNG) DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-crypto@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -19486,7 +19486,7 @@ S: Maintained F: drivers/platform/x86/samsung-laptop.c SAMSUNG MULTIFUNCTION PMIC DEVICE DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -19512,7 +19512,7 @@ F: drivers/media/platform/samsung/s3c-camif/ F: include/media/drv-intf/s3c_camif.h SAMSUNG S3FWRN5 NFC DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski S: Maintained F: Documentation/devicetree/bindings/net/nfc/samsung,s3fwrn5.yaml F: drivers/nfc/s3fwrn5 @@ -19533,7 +19533,7 @@ S: Supported F: drivers/media/i2c/s5k5baf.c SAMSUNG S5P Security SubSystem (SSS) DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Vladimir Zapolskiy L: linux-crypto@vger.kernel.org L: linux-samsung-soc@vger.kernel.org @@ -19555,7 +19555,7 @@ F: Documentation/devicetree/bindings/media/samsung,fimc.yaml F: drivers/media/platform/samsung/exynos4-is/ SAMSUNG SOC CLOCK DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Sylwester Nawrocki M: Chanwoo Choi R: Alim Akhtar @@ -19587,7 +19587,7 @@ F: drivers/net/ethernet/samsung/sxgbe/ SAMSUNG THERMAL DRIVER M: Bartlomiej Zolnierkiewicz -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-pm@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -23782,7 +23782,7 @@ S: Orphan F: drivers/mmc/host/vub300.c W1 DALLAS'S 1-WIRE BUS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski S: Maintained F: Documentation/devicetree/bindings/w1/ F: Documentation/w1/ From 6d029c25b71f2de2838a6f093ce0fa0e69336154 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 9 Apr 2024 15:38:03 +0200 Subject: [PATCH 096/270] selftests/timers/posix_timers: Reimplement check_timer_distribution() check_timer_distribution() runs ten threads in a busy loop and tries to test that the kernel distributes a process posix CPU timer signal to every thread over time. There is not guarantee that this is true even after commit bcb7ee79029d ("posix-timers: Prefer delivery of signals to the current thread") because that commit only avoids waking up the sleeping process leader thread, but that has nothing to do with the actual signal delivery. As the signal is process wide the first thread which observes sigpending and wins the race to lock sighand will deliver the signal. Testing shows that this hangs on a regular base because some threads never win the race. The comment "This primarily tests that the kernel does not favour any one." is wrong. The kernel does favour a thread which hits the timer interrupt when CLOCK_PROCESS_CPUTIME_ID expires. Rewrite the test so it only checks that the group leader sleeping in join() never receives SIGALRM and the thread which burns CPU cycles receives all signals. In older kernels which do not have commit bcb7ee79029d ("posix-timers: Prefer delivery of signals to the current thread") the test-case fails immediately, the very 1st tick wakes the leader up. Otherwise it quickly succeeds after 100 ticks. CI testing wants to use newer selftest versions on stable kernels. In this case the test is guaranteed to fail. So check in the failure case whether the kernel version is less than v6.3 and skip the test result in that case. [ tglx: Massaged change log, renamed the version check helper ] Fixes: e797203fb3ba ("selftests/timers/posix_timers: Test delivery of signals across threads") Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240409133802.GD29396@redhat.com --- tools/testing/selftests/kselftest.h | 13 +++ tools/testing/selftests/timers/posix_timers.c | 103 ++++++++---------- 2 files changed, 60 insertions(+), 56 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 541bf192e30e..973b18e156b2 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -51,6 +51,7 @@ #include #include #include +#include #endif #ifndef ARRAY_SIZE @@ -388,4 +389,16 @@ static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...) exit(KSFT_SKIP); } +static inline int ksft_min_kernel_version(unsigned int min_major, + unsigned int min_minor) +{ + unsigned int major, minor; + struct utsname info; + + if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) + ksft_exit_fail_msg("Can't parse kernel version\n"); + + return major > min_major || (major == min_major && minor >= min_minor); +} + #endif /* __KSELFTEST_H */ diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index d49dd3ffd0d9..d86a0e00711e 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -184,80 +184,71 @@ static int check_timer_create(int which) return 0; } -int remain; -__thread int got_signal; +static pthread_t ctd_thread; +static volatile int ctd_count, ctd_failed; -static void *distribution_thread(void *arg) +static void ctd_sighandler(int sig) { - while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); - return NULL; + if (pthread_self() != ctd_thread) + ctd_failed = 1; + ctd_count--; } -static void distribution_handler(int nr) +static void *ctd_thread_func(void *arg) { - if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED)) - __atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED); -} - -/* - * Test that all running threads _eventually_ receive CLOCK_PROCESS_CPUTIME_ID - * timer signals. This primarily tests that the kernel does not favour any one. - */ -static int check_timer_distribution(void) -{ - int err, i; - timer_t id; - const int nthreads = 10; - pthread_t threads[nthreads]; struct itimerspec val = { .it_value.tv_sec = 0, .it_value.tv_nsec = 1000 * 1000, .it_interval.tv_sec = 0, .it_interval.tv_nsec = 1000 * 1000, }; + timer_t id; - remain = nthreads + 1; /* worker threads + this thread */ - signal(SIGALRM, distribution_handler); - err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); - if (err < 0) { - ksft_perror("Can't create timer"); - return -1; - } - err = timer_settime(id, 0, &val, NULL); - if (err < 0) { - ksft_perror("Can't set timer"); - return -1; - } + /* 1/10 seconds to ensure the leader sleeps */ + usleep(10000); - for (i = 0; i < nthreads; i++) { - err = pthread_create(&threads[i], NULL, distribution_thread, - NULL); - if (err) { - ksft_print_msg("Can't create thread: %s (%d)\n", - strerror(errno), errno); - return -1; - } - } + ctd_count = 100; + if (timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id)) + return "Can't create timer\n"; + if (timer_settime(id, 0, &val, NULL)) + return "Can't set timer\n"; - /* Wait for all threads to receive the signal. */ - while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); + while (ctd_count > 0 && !ctd_failed) + ; - for (i = 0; i < nthreads; i++) { - err = pthread_join(threads[i], NULL); - if (err) { - ksft_print_msg("Can't join thread: %s (%d)\n", - strerror(errno), errno); - return -1; - } - } + if (timer_delete(id)) + return "Can't delete timer\n"; - if (timer_delete(id)) { - ksft_perror("Can't delete timer"); - return -1; - } + return NULL; +} - ksft_test_result_pass("check_timer_distribution\n"); +/* + * Test that only the running thread receives the timer signal. + */ +static int check_timer_distribution(void) +{ + const char *errmsg; + + signal(SIGALRM, ctd_sighandler); + + errmsg = "Can't create thread\n"; + if (pthread_create(&ctd_thread, NULL, ctd_thread_func, NULL)) + goto err; + + errmsg = "Can't join thread\n"; + if (pthread_join(ctd_thread, (void **)&errmsg) || errmsg) + goto err; + + if (!ctd_failed) + ksft_test_result_pass("check signal distribution\n"); + else if (ksft_min_kernel_version(6, 3)) + ksft_test_result_fail("check signal distribution\n"); + else + ksft_test_result_skip("check signal distribution (old kernel)\n"); return 0; +err: + ksft_print_msg(errmsg); + return -1; } int main(int argc, char **argv) From 4c08f01934ab67d1d283d5cbaa52b923abcfe4cd Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Sun, 7 Apr 2024 22:28:02 -0400 Subject: [PATCH 097/270] drm/vmwgfx: Enable DMA mappings with SEV Enable DMA mappings in vmwgfx after TTM has been fixed in commit 3bf3710e3718 ("drm/ttm: Add a generic TTM memcpy move for page-based iomem") This enables full guest-backed memory support and in particular allows usage of screen targets as the presentation mechanism. Signed-off-by: Zack Rusin Reported-by: Ye Li Tested-by: Ye Li Fixes: 3b0d6458c705 ("drm/vmwgfx: Refuse DMA operation when SEV encryption is active") Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v6.6+ Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20240408022802.358641-1-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index c7d90f96d16a..0a304706e013 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -666,11 +666,12 @@ static int vmw_dma_select_mode(struct vmw_private *dev_priv) [vmw_dma_map_populate] = "Caching DMA mappings.", [vmw_dma_map_bind] = "Giving up DMA mappings early."}; - /* TTM currently doesn't fully support SEV encryption. */ - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) - return -EINVAL; - - if (vmw_force_coherent) + /* + * When running with SEV we always want dma mappings, because + * otherwise ttm tt pool pages will bounce through swiotlb running + * out of available space. + */ + if (vmw_force_coherent || cc_platform_has(CC_ATTR_MEM_ENCRYPT)) dev_priv->map_mode = vmw_dma_alloc_coherent; else if (vmw_restrict_iommu) dev_priv->map_mode = vmw_dma_map_bind; From ff81dade48608363136d52bb2493a6df76458b28 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Wed, 10 Apr 2024 01:35:28 +0800 Subject: [PATCH 098/270] io-uring: correct typo in comment for IOU_F_TWQ_LAZY_WAKE The 'r' key is near to 't' key, that makes 'with' to be 'wirh' ? :) Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/r/20240409173531.846714-1-haiyue.wang@intel.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 05df0e399d7c..ac333ea81d31 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -13,7 +13,7 @@ enum { * A hint to not wake right away but delay until there are enough of * tw's queued to match the number of CQEs the task is waiting for. * - * Must not be used wirh requests generating more than one CQE. + * Must not be used with requests generating more than one CQE. * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. */ IOU_F_TWQ_LAZY_WAKE = 1, From 68879386180c0efd5a11e800b0525a01068c9457 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 26 Mar 2024 14:39:20 +0900 Subject: [PATCH 099/270] btrfs: zoned: do not flag ZEROOUT on non-dirty extent buffer Btrfs clears the content of an extent buffer marked as EXTENT_BUFFER_ZONED_ZEROOUT before the bio submission. This mechanism is introduced to prevent a write hole of an extent buffer, which is once allocated, marked dirty, but turns out unnecessary and cleaned up within one transaction operation. Currently, btrfs_clear_buffer_dirty() marks the extent buffer as EXTENT_BUFFER_ZONED_ZEROOUT, and skips the entry function. If this call happens while the buffer is under IO (with the WRITEBACK flag set, without the DIRTY flag), we can add the ZEROOUT flag and clear the buffer's content just before a bio submission. As a result: 1) it can lead to adding faulty delayed reference item which leads to a FS corrupted (EUCLEAN) error, and 2) it writes out cleared tree node on disk The former issue is previously discussed in [1]. The corruption happens when it runs a delayed reference update. So, on-disk data is safe. [1] https://lore.kernel.org/linux-btrfs/3f4f2a0ff1a6c818050434288925bdcf3cd719e5.1709124777.git.naohiro.aota@wdc.com/ The latter one can reach on-disk data. But, as that node is already processed by btrfs_clear_buffer_dirty(), that will be invalidated in the next transaction commit anyway. So, the chance of hitting the corruption is relatively small. Anyway, we should skip flagging ZEROOUT on a non-DIRTY extent buffer, to keep the content under IO intact. Fixes: aa6313e6ff2b ("btrfs: zoned: don't clear dirty flag of extent buffer") CC: stable@vger.kernel.org # 6.8 Link: https://lore.kernel.org/linux-btrfs/oadvdekkturysgfgi4qzuemd57zudeasynswurjxw3ocdfsef6@sjyufeugh63f/ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 61594eaf1f89..df3fe36126f9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4154,7 +4154,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, * The actual zeroout of the buffer will happen later in * btree_csum_one_bio. */ - if (btrfs_is_zoned(fs_info)) { + if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); return; } From 073bda7a541731f41ed08f32d286394236c74005 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 26 Mar 2024 14:39:21 +0900 Subject: [PATCH 100/270] btrfs: zoned: add ASSERT and WARN for EXTENT_BUFFER_ZONED_ZEROOUT handling Add an ASSERT to catch a faulty delayed reference item resulting from prematurely cleared extent buffer. Also, add a WARN to detect if we try to dirty a ZEROOUT buffer again, which is suspicious as its update will be lost. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 ++++++++ fs/btrfs/extent_io.c | 1 + 2 files changed, 9 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index beedd6ed64d3..257d044bca91 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3464,6 +3464,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root_id != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_ref generic_ref = { 0 }; + /* + * Assert that the extent buffer is not cleared due to + * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer + * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for + * detail. + */ + ASSERT(btrfs_header_bytenr(buf) != 0); + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, buf->start, buf->len, parent, btrfs_header_owner(buf)); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index df3fe36126f9..b18034f2ab80 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4193,6 +4193,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { bool subpage = eb->fs_info->nodesize < PAGE_SIZE; From 1db7959aacd905e6487d0478ac01d89f86eb1e51 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Mar 2024 09:16:46 +1030 Subject: [PATCH 101/270] btrfs: do not wait for short bulk allocation [BUG] There is a recent report that when memory pressure is high (including cached pages), btrfs can spend most of its time on memory allocation in btrfs_alloc_page_array() for compressed read/write. [CAUSE] For btrfs_alloc_page_array() we always go alloc_pages_bulk_array(), and even if the bulk allocation failed (fell back to single page allocation) we still retry but with extra memalloc_retry_wait(). If the bulk alloc only returned one page a time, we would spend a lot of time on the retry wait. The behavior was introduced in commit 395cb57e8560 ("btrfs: wait between incomplete batch memory allocations"). [FIX] Although the commit mentioned that other filesystems do the wait, it's not the case at least nowadays. All the mainlined filesystems only call memalloc_retry_wait() if they failed to allocate any page (not only for bulk allocation). If there is any progress, they won't call memalloc_retry_wait() at all. For example, xfs_buf_alloc_pages() would only call memalloc_retry_wait() if there is no allocation progress at all, and the call is not for metadata readahead. So I don't believe we should call memalloc_retry_wait() unconditionally for short allocation. Call memalloc_retry_wait() if it fails to allocate any page for tree block allocation (which goes with __GFP_NOFAIL and may not need the special handling anyway), and reduce the latency for btrfs_alloc_page_array(). Reported-by: Julian Taylor Tested-by: Julian Taylor Link: https://lore.kernel.org/all/8966c095-cbe7-4d22-9784-a647d1bf27c3@1und1.de/ Fixes: 395cb57e8560 ("btrfs: wait between incomplete batch memory allocations") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Sweet Tea Dorminy Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b18034f2ab80..2776112dbdf8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -681,31 +681,21 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, gfp_t extra_gfp) { + const gfp_t gfp = GFP_NOFS | extra_gfp; unsigned int allocated; for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp, - nr_pages, page_array); - - if (allocated == nr_pages) - return 0; - - /* - * During this iteration, no page could be allocated, even - * though alloc_pages_bulk_array() falls back to alloc_page() - * if it could not bulk-allocate. So we must be out of memory. - */ - if (allocated == last) { + allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array); + if (unlikely(allocated == last)) { + /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { __free_page(page_array[i]); page_array[i] = NULL; } return -ENOMEM; } - - memalloc_retry_wait(GFP_NOFS); } return 0; } From 60b703c71fa80de0c2e14af66e57e234019b7da2 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 2 Apr 2024 12:17:16 +0200 Subject: [PATCH 102/270] zonefs: Use str_plural() to fix Coccinelle warning Fixes the following Coccinelle/coccicheck warning reported by string_choices.cocci: opportunity for str_plural(zgroup->g_nr_zones) Signed-off-by: Thorsten Blum Signed-off-by: Damien Le Moal --- fs/zonefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index c6a124e8d565..964fa7f24003 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1048,7 +1048,7 @@ static int zonefs_init_zgroup(struct super_block *sb, zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", zonefs_zgroup_name(ztype), zgroup->g_nr_zones, - zgroup->g_nr_zones > 1 ? "s" : ""); + str_plural(zgroup->g_nr_zones)); return 0; } From 8bdfb4ea95ca738d33ef71376c21eba20130f2eb Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Tue, 26 Mar 2024 15:32:46 -0400 Subject: [PATCH 103/270] drm/amdkfd: Reset GPU on queue preemption failure Currently, with F32 HWS GPU reset is only when unmap queue fails. However, if compute queue doesn't repond to preemption request in time unmap will return without any error. In this case, only preemption error is logged and Reset is not triggered. Call GPU reset in this case also. Reviewed-by: Alex Deucher Signed-off-by: Harish Kasiviswanathan Reviewed-by: Mukul Joshi Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f4d395e38683..0b655555e167 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2001,6 +2001,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n"); while (halt_if_hws_hang) schedule(); + kfd_hws_hang(dqm); return -ETIME; } From 65ff8092e4802f96d87d3d7cde146961f5228265 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sat, 23 Mar 2024 20:46:53 -0400 Subject: [PATCH 104/270] drm/amdgpu: always force full reset for SOC21 There are cases where soft reset seems to succeed, but does not, so always use mode1/2 for now. Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc21.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 581a3bd11481..8526282f4da1 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -457,10 +457,8 @@ static bool soc21_need_full_reset(struct amdgpu_device *adev) { switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(11, 0, 0): - return amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC); case IP_VERSION(11, 0, 2): case IP_VERSION(11, 0, 3): - return false; default: return true; } From 4b18a91faf1752f9bd69a4ed3aed2c8f6e5b0528 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Thu, 21 Mar 2024 17:46:36 +0530 Subject: [PATCH 105/270] drm/amdgpu: Refine IB schedule error logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Downgrade to debug information when IBs are skipped. Also, use dev_* to identify the device. Signed-off-by: Lijo Lazar Reviewed-by: Christian König Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 4b3000c21ef2..e4742b65032d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -304,12 +304,15 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) dma_fence_set_error(finished, -ECANCELED); if (finished->error < 0) { - DRM_INFO("Skip scheduling IBs!\n"); + dev_dbg(adev->dev, "Skip scheduling IBs in ring(%s)", + ring->name); } else { r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, &fence); if (r) - DRM_ERROR("Error scheduling IBs (%d)\n", r); + dev_err(adev->dev, + "Error scheduling IBs (%d) in ring(%s)", r, + ring->name); } job->job_run_counter++; From 0f1bbcc2bab25d5fb2dfb1ee3e08131437690d3d Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Mon, 25 Mar 2024 13:24:31 +0800 Subject: [PATCH 106/270] drm/amdgpu/umsch: reinitialize write pointer in hw init Otherwise the old one will be used during GPU reset. That's not expected. Signed-off-by: Lang Yu Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c b/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c index 84368cf1e175..bd57896ab85d 100644 --- a/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c @@ -225,6 +225,8 @@ static int umsch_mm_v4_0_ring_start(struct amdgpu_umsch_mm *umsch) WREG32_SOC15(VCN, 0, regVCN_UMSCH_RB_SIZE, ring->ring_size); + ring->wptr = 0; + data = RREG32_SOC15(VCN, 0, regVCN_RB_ENABLE); data &= ~(VCN_RB_ENABLE__AUDIO_RB_EN_MASK); WREG32_SOC15(VCN, 0, regVCN_RB_ENABLE, data); From 8b2be55f4d6c1099d7f629b0ed7535a5be788c83 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Wed, 14 Feb 2024 17:55:54 +0530 Subject: [PATCH 107/270] drm/amdgpu: Reset dGPU if suspend got aborted For SOC21 ASICs, there is an issue in re-enabling PM features if a suspend got aborted. In such cases, reset the device during resume phase. This is a workaround till a proper solution is finalized. Signed-off-by: Lijo Lazar Reviewed-by: Alex Deucher Reviewed-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc21.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 8526282f4da1..abe319b0f063 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -867,10 +867,35 @@ static int soc21_common_suspend(void *handle) return soc21_common_hw_fini(adev); } +static bool soc21_need_reset_on_resume(struct amdgpu_device *adev) +{ + u32 sol_reg1, sol_reg2; + + /* Will reset for the following suspend abort cases. + * 1) Only reset dGPU side. + * 2) S3 suspend got aborted and TOS is active. + */ + if (!(adev->flags & AMD_IS_APU) && adev->in_s3 && + !adev->suspend_complete) { + sol_reg1 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81); + msleep(100); + sol_reg2 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81); + + return (sol_reg1 != sol_reg2); + } + + return false; +} + static int soc21_common_resume(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + if (soc21_need_reset_on_resume(adev)) { + dev_info(adev->dev, "S3 suspend aborted, resetting..."); + soc21_asic_reset(adev); + } + return soc21_common_hw_init(adev); } From d4396924c3d44f34d0643f650e70892e07f3677f Mon Sep 17 00:00:00 2001 From: Li Ma Date: Thu, 28 Mar 2024 10:55:10 +0800 Subject: [PATCH 108/270] drm/amd/display: add DCN 351 version for microcode load There is a new DCN veriosn 3.5.1 need to load Signed-off-by: Li Ma Reviewed-by: Yifan Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 71d2d44681b2..173438fc7537 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -148,6 +148,9 @@ MODULE_FIRMWARE(FIRMWARE_NAVI12_DMCU); #define FIRMWARE_DCN_35_DMUB "amdgpu/dcn_3_5_dmcub.bin" MODULE_FIRMWARE(FIRMWARE_DCN_35_DMUB); +#define FIRMWARE_DCN_351_DMUB "amdgpu/dcn_3_5_1_dmcub.bin" +MODULE_FIRMWARE(FIRMWARE_DCN_351_DMUB); + /* Number of bytes in PSP header for firmware. */ #define PSP_HEADER_BYTES 0x100 @@ -4820,9 +4823,11 @@ static int dm_init_microcode(struct amdgpu_device *adev) fw_name_dmub = FIRMWARE_DCN_V3_2_1_DMCUB; break; case IP_VERSION(3, 5, 0): - case IP_VERSION(3, 5, 1): fw_name_dmub = FIRMWARE_DCN_35_DMUB; break; + case IP_VERSION(3, 5, 1): + fw_name_dmub = FIRMWARE_DCN_351_DMUB; + break; default: /* ASIC doesn't support DMUB. */ return 0; From 31729e8c21ecfd671458e02b6511eb68c2225113 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Wed, 27 Mar 2024 13:10:37 +0800 Subject: [PATCH 109/270] drm/amd/pm: fixes a random hang in S4 for SMU v13.0.4/11 While doing multiple S4 stress tests, GC/RLC/PMFW get into an invalid state resulting into hard hangs. Adding a GFX reset as workaround just before sending the MP1_UNLOAD message avoids this failure. Signed-off-by: Tim Huang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c index bb98156b2fa1..949131bd1ecb 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c @@ -226,8 +226,18 @@ static int smu_v13_0_4_system_features_control(struct smu_context *smu, bool en) struct amdgpu_device *adev = smu->adev; int ret = 0; - if (!en && !adev->in_s0ix) + if (!en && !adev->in_s0ix) { + /* Adds a GFX reset as workaround just before sending the + * MP1_UNLOAD message to prevent GC/RLC/PMFW from entering + * an invalid state. + */ + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, + SMU_RESET_MODE_2, NULL); + if (ret) + return ret; + ret = smu_cmn_send_smc_msg(smu, SMU_MSG_PrepareMp1ForUnload, NULL); + } return ret; } From a3a4c0b12346a2493b41c8790d85141844a04e28 Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Fri, 22 Mar 2024 12:25:16 -0400 Subject: [PATCH 110/270] drm/amdgpu : Add mes_log_enable to control mes log feature The MES log might slow down the performance for extra step of log the data, disable it by default and introduce a parameter can enable it when necessary Signed-off-by: shaoyunl Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++++- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 7 +++++-- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9c62552bec34..b3b84647207e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -210,6 +210,7 @@ extern int amdgpu_async_gfx_ring; extern int amdgpu_mcbp; extern int amdgpu_discovery; extern int amdgpu_mes; +extern int amdgpu_mes_log_enable; extern int amdgpu_mes_kiq; extern int amdgpu_noretry; extern int amdgpu_force_asic_type; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 80b9642f2bc4..e4277298cf1a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -195,6 +195,7 @@ int amdgpu_async_gfx_ring = 1; int amdgpu_mcbp = -1; int amdgpu_discovery = -1; int amdgpu_mes; +int amdgpu_mes_log_enable = 0; int amdgpu_mes_kiq; int amdgpu_noretry = -1; int amdgpu_force_asic_type = -1; @@ -667,6 +668,15 @@ MODULE_PARM_DESC(mes, "Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)"); module_param_named(mes, amdgpu_mes, int, 0444); +/** + * DOC: mes_log_enable (int) + * Enable Micro Engine Scheduler log. This is used to enable/disable MES internal log. + * (0 = disabled (default), 1 = enabled) + */ +MODULE_PARM_DESC(mes_log_enable, + "Enable Micro Engine Scheduler log (0 = disabled (default), 1 = enabled)"); +module_param_named(mes_log_enable, amdgpu_mes_log_enable, int, 0444); + /** * DOC: mes_kiq (int) * Enable Micro Engine Scheduler KIQ. This is a new engine pipe for kiq. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index a98e03e0a51f..cfd613ea39b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -102,6 +102,9 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) { int r; + if (!amdgpu_mes_log_enable) + return 0; + r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, &adev->mes.event_log_gpu_obj, @@ -1565,7 +1568,7 @@ void amdgpu_debugfs_mes_event_log_init(struct amdgpu_device *adev) #if defined(CONFIG_DEBUG_FS) struct drm_minor *minor = adev_to_drm(adev)->primary; struct dentry *root = minor->debugfs_root; - if (adev->enable_mes) + if (adev->enable_mes && amdgpu_mes_log_enable) debugfs_create_file("amdgpu_mes_event_log", 0444, root, adev, &amdgpu_debugfs_mes_event_log_fops); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 072c478665ad..63f281a9984d 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -411,8 +411,11 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) mes_set_hw_res_pkt.enable_reg_active_poll = 1; mes_set_hw_res_pkt.enable_level_process_quantum_check = 1; mes_set_hw_res_pkt.oversubscription_timer = 50; - mes_set_hw_res_pkt.enable_mes_event_int_logging = 1; - mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr; + if (amdgpu_mes_log_enable) { + mes_set_hw_res_pkt.enable_mes_event_int_logging = 1; + mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = + mes->event_log_gpu_addr; + } return mes_v11_0_submit_pkt_and_poll_completion(mes, &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), From 5b0cd091d905ee9da0a3ecdf06b9cbdd17ba711d Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Fri, 22 Mar 2024 12:44:55 -0400 Subject: [PATCH 111/270] drm/amdgpu : Increase the mes log buffer size as per new MES FW version From MES version 0x54, the log entry increased and require the log buffer size to be increased. The 16k is maximum size agreed Signed-off-by: shaoyunl Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++--- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index cfd613ea39b3..a00cf4756ad0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -105,7 +105,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) if (!amdgpu_mes_log_enable) return 0; - r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, + r = amdgpu_bo_create_kernel(adev, AMDGPU_MES_LOG_BUFFER_SIZE, PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, &adev->mes.event_log_gpu_obj, &adev->mes.event_log_gpu_addr, @@ -1552,12 +1552,11 @@ static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused) uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr); seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4, - mem, PAGE_SIZE, false); + mem, AMDGPU_MES_LOG_BUFFER_SIZE, false); return 0; } - DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 7d4f93fea937..4c8fc3117ef8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -52,6 +52,7 @@ enum amdgpu_mes_priority_level { #define AMDGPU_MES_PROC_CTX_SIZE 0x1000 /* one page area */ #define AMDGPU_MES_GANG_CTX_SIZE 0x1000 /* one page area */ +#define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 /* Maximu log buffer size for MES */ struct amdgpu_mes_funcs; From c5b1ccff26950d50bf2043cb2af9bafb1f08bbaf Mon Sep 17 00:00:00 2001 From: lima1002 Date: Mon, 29 Jan 2024 20:17:54 +0800 Subject: [PATCH 112/270] drm/amd/swsmu: Update smu v14.0.0 headers to be 14.0.1 compatible update ppsmc.h pmfw.h and driver_if.h for smu v14_0_1 Reviewed-by: Alex Deucher Signed-off-by: lima1002 Signed-off-by: Alex Deucher --- .../inc/pmfw_if/smu14_driver_if_v14_0_0.h | 33 +- .../pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h | 55 ++- .../pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h | 18 +- drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h | 1 + .../gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c | 2 +- .../drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c | 347 ++++++++++++++++-- 6 files changed, 413 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h index 5bb7a63c0602..97522c085258 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h @@ -144,6 +144,37 @@ typedef struct { uint32_t MaxGfxClk; } DpmClocks_t; +//Freq in MHz +//Voltage in milli volts with 2 fractional bits +typedef struct { + uint32_t DcfClocks[NUM_DCFCLK_DPM_LEVELS]; + uint32_t DispClocks[NUM_DISPCLK_DPM_LEVELS]; + uint32_t DppClocks[NUM_DPPCLK_DPM_LEVELS]; + uint32_t SocClocks[NUM_SOCCLK_DPM_LEVELS]; + uint32_t VClocks0[NUM_VCN_DPM_LEVELS]; + uint32_t VClocks1[NUM_VCN_DPM_LEVELS]; + uint32_t DClocks0[NUM_VCN_DPM_LEVELS]; + uint32_t DClocks1[NUM_VCN_DPM_LEVELS]; + uint32_t VPEClocks[NUM_VPE_DPM_LEVELS]; + uint32_t FclkClocks_Freq[NUM_FCLK_DPM_LEVELS]; + uint32_t FclkClocks_Voltage[NUM_FCLK_DPM_LEVELS]; + uint32_t SocVoltage[NUM_SOC_VOLTAGE_LEVELS]; + MemPstateTable_t MemPstateTable[NUM_MEM_PSTATE_LEVELS]; + + uint8_t NumDcfClkLevelsEnabled; + uint8_t NumDispClkLevelsEnabled; //Applies to both Dispclk and Dppclk + uint8_t NumSocClkLevelsEnabled; + uint8_t Vcn0ClkLevelsEnabled; //Applies to both Vclk0 and Dclk0 + uint8_t Vcn1ClkLevelsEnabled; //Applies to both Vclk1 and Dclk1 + uint8_t VpeClkLevelsEnabled; + uint8_t NumMemPstatesEnabled; + uint8_t NumFclkLevelsEnabled; + uint8_t spare; + + uint32_t MinGfxClk; + uint32_t MaxGfxClk; +} DpmClocks_t_v14_0_1; + typedef struct { uint16_t CoreFrequency[16]; //Target core frequency [MHz] uint16_t CorePower[16]; //CAC calculated core power [mW] @@ -224,7 +255,7 @@ typedef enum { #define TABLE_CUSTOM_DPM 2 // Called by Driver #define TABLE_BIOS_GPIO_CONFIG 3 // Called by BIOS #define TABLE_DPMCLOCKS 4 // Called by Driver and VBIOS -#define TABLE_SPARE0 5 // Unused +#define TABLE_MOMENTARY_PM 5 // Called by Tools #define TABLE_MODERN_STDBY 6 // Called by Tools for Modern Standby Log #define TABLE_SMU_METRICS 7 // Called by Driver and SMF/PMF #define TABLE_COUNT 8 diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h index 356e0f57a426..ddb625860083 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h @@ -42,7 +42,7 @@ #define FEATURE_EDC_BIT 7 #define FEATURE_PLL_POWER_DOWN_BIT 8 #define FEATURE_VDDOFF_BIT 9 -#define FEATURE_VCN_DPM_BIT 10 +#define FEATURE_VCN_DPM_BIT 10 /* this is for both VCN0 and VCN1 */ #define FEATURE_DS_MPM_BIT 11 #define FEATURE_FCLK_DPM_BIT 12 #define FEATURE_SOCCLK_DPM_BIT 13 @@ -56,9 +56,9 @@ #define FEATURE_DS_GFXCLK_BIT 21 #define FEATURE_DS_SOCCLK_BIT 22 #define FEATURE_DS_LCLK_BIT 23 -#define FEATURE_LOW_POWER_DCNCLKS_BIT 24 // for all DISP clks +#define FEATURE_LOW_POWER_DCNCLKS_BIT 24 #define FEATURE_DS_SHUBCLK_BIT 25 -#define FEATURE_SPARE0_BIT 26 //SPARE +#define FEATURE_RESERVED0_BIT 26 #define FEATURE_ZSTATES_BIT 27 #define FEATURE_IOMMUL2_PG_BIT 28 #define FEATURE_DS_FCLK_BIT 29 @@ -66,8 +66,8 @@ #define FEATURE_DS_MP1CLK_BIT 31 #define FEATURE_WHISPER_MODE_BIT 32 #define FEATURE_SMU_LOW_POWER_BIT 33 -#define FEATURE_SMART_L3_RINSER_BIT 34 -#define FEATURE_SPARE1_BIT 35 //SPARE +#define FEATURE_RESERVED1_BIT 34 /* v14_0_0 SMART_L3_RINSER; v14_0_1 RESERVED1 */ +#define FEATURE_GFX_DEM_BIT 35 /* v14_0_0 SPARE; v14_0_1 GFX_DEM */ #define FEATURE_PSI_BIT 36 #define FEATURE_PROCHOT_BIT 37 #define FEATURE_CPUOFF_BIT 38 @@ -77,11 +77,11 @@ #define FEATURE_PERF_LIMIT_BIT 42 #define FEATURE_CORE_DLDO_BIT 43 #define FEATURE_DVO_BIT 44 -#define FEATURE_DS_VCN_BIT 45 +#define FEATURE_DS_VCN_BIT 45 /* v14_0_1 this is for both VCN0 and VCN1 */ #define FEATURE_CPPC_BIT 46 #define FEATURE_CPPC_PREFERRED_CORES 47 #define FEATURE_DF_CSTATES_BIT 48 -#define FEATURE_SPARE2_BIT 49 //SPARE +#define FEATURE_FAST_PSTATE_CLDO_BIT 49 /* v14_0_0 SPARE */ #define FEATURE_ATHUB_PG_BIT 50 #define FEATURE_VDDOFF_ECO_BIT 51 #define FEATURE_ZSTATES_ECO_BIT 52 @@ -93,8 +93,8 @@ #define FEATURE_DS_IPUCLK_BIT 58 #define FEATURE_DS_VPECLK_BIT 59 #define FEATURE_VPE_DPM_BIT 60 -#define FEATURE_SPARE_61 61 -#define FEATURE_FP_DIDT 62 +#define FEATURE_SMART_L3_RINSER_BIT 61 /* v14_0_0 SPARE*/ +#define FEATURE_PCC_BIT 62 /* v14_0_0 FP_DIDT v14_0_1 PCC_BIT */ #define NUM_FEATURES 63 // Firmware Header/Footer @@ -151,6 +151,43 @@ typedef struct { // MP1_EXT_SCRATCH7 = RTOS Current Job } FwStatus_t; +typedef struct { + // MP1_EXT_SCRATCH0 + uint32_t DpmHandlerID : 8; + uint32_t ActivityMonitorID : 8; + uint32_t DpmTimerID : 8; + uint32_t DpmHubID : 4; + uint32_t DpmHubTask : 4; + // MP1_EXT_SCRATCH1 + uint32_t CclkSyncStatus : 8; + uint32_t ZstateStatus : 4; + uint32_t Cpu1VddOff : 4; + uint32_t DstateFun : 4; + uint32_t DstateDev : 4; + uint32_t GfxOffStatus : 2; + uint32_t Cpu0Off : 2; + uint32_t Cpu1Off : 2; + uint32_t Cpu0VddOff : 2; + // MP1_EXT_SCRATCH2 + uint32_t P2JobHandler :32; + // MP1_EXT_SCRATCH3 + uint32_t PostCode :32; + // MP1_EXT_SCRATCH4 + uint32_t MsgPortBusy :15; + uint32_t RsmuPmiP1Pending : 1; + uint32_t RsmuPmiP2PendingCnt : 8; + uint32_t DfCstateExitPending : 1; + uint32_t Pc6EntryPending : 1; + uint32_t Pc6ExitPending : 1; + uint32_t WarmResetPending : 1; + uint32_t Mp0ClkPending : 1; + uint32_t InWhisperMode : 1; + uint32_t spare2 : 2; + // MP1_EXT_SCRATCH5 + uint32_t IdleMask :32; + // MP1_EXT_SCRATCH6 = RTOS threads' status + // MP1_EXT_SCRATCH7 = RTOS Current Job +} FwStatus_t_v14_0_1; #pragma pack(pop) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h index ca7ce4251482..c4dc5881d8df 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h @@ -72,23 +72,19 @@ #define PPSMC_MSG_SetHardMinSocclkByFreq 0x13 ///< Set hard min for SOC CLK #define PPSMC_MSG_SetSoftMinFclk 0x14 ///< Set hard min for FCLK #define PPSMC_MSG_SetSoftMinVcn0 0x15 ///< Set soft min for VCN0 clocks (VCLK0 and DCLK0) - #define PPSMC_MSG_EnableGfxImu 0x16 ///< Enable GFX IMU - -#define PPSMC_MSG_spare_0x17 0x17 -#define PPSMC_MSG_spare_0x18 0x18 +#define PPSMC_MSG_spare_0x17 0x17 ///< Get GFX clock frequency +#define PPSMC_MSG_spare_0x18 0x18 ///< Get FCLK frequency #define PPSMC_MSG_AllowGfxOff 0x19 ///< Inform PMFW of allowing GFXOFF entry #define PPSMC_MSG_DisallowGfxOff 0x1A ///< Inform PMFW of disallowing GFXOFF entry #define PPSMC_MSG_SetSoftMaxGfxClk 0x1B ///< Set soft max for GFX CLK #define PPSMC_MSG_SetHardMinGfxClk 0x1C ///< Set hard min for GFX CLK - #define PPSMC_MSG_SetSoftMaxSocclkByFreq 0x1D ///< Set soft max for SOC CLK #define PPSMC_MSG_SetSoftMaxFclkByFreq 0x1E ///< Set soft max for FCLK #define PPSMC_MSG_SetSoftMaxVcn0 0x1F ///< Set soft max for VCN0 clocks (VCLK0 and DCLK0) -#define PPSMC_MSG_spare_0x20 0x20 +#define PPSMC_MSG_spare_0x20 0x20 ///< Set power limit percentage #define PPSMC_MSG_PowerDownJpeg0 0x21 ///< Power down Jpeg of VCN0 #define PPSMC_MSG_PowerUpJpeg0 0x22 ///< Power up Jpeg of VCN0; VCN0 is power gated by default - #define PPSMC_MSG_SetHardMinFclkByFreq 0x23 ///< Set hard min for FCLK #define PPSMC_MSG_SetSoftMinSocclkByFreq 0x24 ///< Set soft min for SOC CLK #define PPSMC_MSG_AllowZstates 0x25 ///< Inform PMFM of allowing Zstate entry, i.e. no Miracast activity @@ -99,8 +95,8 @@ #define PPSMC_MSG_PowerUpIspByTile 0x2A ///< This message is used to power up ISP tiles and enable the ISP DPM #define PPSMC_MSG_SetHardMinIspiclkByFreq 0x2B ///< Set HardMin by frequency for ISPICLK #define PPSMC_MSG_SetHardMinIspxclkByFreq 0x2C ///< Set HardMin by frequency for ISPXCLK -#define PPSMC_MSG_PowerDownUmsch 0x2D ///< Power down VCN.UMSCH (aka VSCH) scheduler -#define PPSMC_MSG_PowerUpUmsch 0x2E ///< Power up VCN.UMSCH (aka VSCH) scheduler +#define PPSMC_MSG_PowerDownUmsch 0x2D ///< Power down VCN0.UMSCH (aka VSCH) scheduler +#define PPSMC_MSG_PowerUpUmsch 0x2E ///< Power up VCN0.UMSCH (aka VSCH) scheduler #define PPSMC_Message_IspStutterOn_MmhubPgDis 0x2F ///< ISP StutterOn mmHub PgDis #define PPSMC_Message_IspStutterOff_MmhubPgEn 0x30 ///< ISP StufferOff mmHub PgEn #define PPSMC_MSG_PowerUpVpe 0x31 ///< Power up VPE @@ -110,7 +106,9 @@ #define PPSMC_MSG_DisableLSdma 0x35 ///< Disable LSDMA #define PPSMC_MSG_SetSoftMaxVpe 0x36 ///< #define PPSMC_MSG_SetSoftMinVpe 0x37 ///< -#define PPSMC_Message_Count 0x38 ///< Total number of PPSMC messages +#define PPSMC_MSG_AllocMALLCache 0x38 ///< Allocating MALL Cache +#define PPSMC_MSG_ReleaseMALLCache 0x39 ///< Releasing MALL Cache +#define PPSMC_Message_Count 0x3A ///< Total number of PPSMC messages /** @}*/ /** diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h index 3f7463c1c1a9..4af1985ae446 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h @@ -27,6 +27,7 @@ #define SMU14_DRIVER_IF_VERSION_INV 0xFFFFFFFF #define SMU14_DRIVER_IF_VERSION_SMU_V14_0_0 0x7 +#define SMU14_DRIVER_IF_VERSION_SMU_V14_0_1 0x6 #define SMU14_DRIVER_IF_VERSION_SMU_V14_0_2 0x1 #define FEATURE_MASK(feature) (1ULL << feature) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c index 9e39f99154f9..07a65e005785 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c @@ -234,7 +234,7 @@ int smu_v14_0_check_fw_version(struct smu_context *smu) smu->smc_driver_if_version = SMU14_DRIVER_IF_VERSION_SMU_V14_0_0; break; case IP_VERSION(14, 0, 1): - smu->smc_driver_if_version = SMU14_DRIVER_IF_VERSION_SMU_V14_0_0; + smu->smc_driver_if_version = SMU14_DRIVER_IF_VERSION_SMU_V14_0_1; break; default: diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c index d6de6d97286c..63399c00cc28 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c @@ -161,7 +161,7 @@ static int smu_v14_0_0_init_smc_tables(struct smu_context *smu) SMU_TABLE_INIT(tables, SMU_TABLE_WATERMARKS, sizeof(Watermarks_t), PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); - SMU_TABLE_INIT(tables, SMU_TABLE_DPMCLOCKS, sizeof(DpmClocks_t), + SMU_TABLE_INIT(tables, SMU_TABLE_DPMCLOCKS, max(sizeof(DpmClocks_t), sizeof(DpmClocks_t_v14_0_1)), PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); SMU_TABLE_INIT(tables, SMU_TABLE_SMU_METRICS, sizeof(SmuMetrics_t), PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); @@ -171,7 +171,7 @@ static int smu_v14_0_0_init_smc_tables(struct smu_context *smu) goto err0_out; smu_table->metrics_time = 0; - smu_table->clocks_table = kzalloc(sizeof(DpmClocks_t), GFP_KERNEL); + smu_table->clocks_table = kzalloc(max(sizeof(DpmClocks_t), sizeof(DpmClocks_t_v14_0_1)), GFP_KERNEL); if (!smu_table->clocks_table) goto err1_out; @@ -593,6 +593,60 @@ static int smu_v14_0_0_mode2_reset(struct smu_context *smu) return ret; } +static int smu_v14_0_1_get_dpm_freq_by_index(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t dpm_level, + uint32_t *freq) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + + if (!clk_table || clk_type >= SMU_CLK_COUNT) + return -EINVAL; + + switch (clk_type) { + case SMU_SOCCLK: + if (dpm_level >= clk_table->NumSocClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->SocClocks[dpm_level]; + break; + case SMU_VCLK: + if (dpm_level >= clk_table->Vcn0ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->VClocks0[dpm_level]; + break; + case SMU_DCLK: + if (dpm_level >= clk_table->Vcn0ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->DClocks0[dpm_level]; + break; + case SMU_VCLK1: + if (dpm_level >= clk_table->Vcn1ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->VClocks1[dpm_level]; + break; + case SMU_DCLK1: + if (dpm_level >= clk_table->Vcn1ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->DClocks1[dpm_level]; + break; + case SMU_UCLK: + case SMU_MCLK: + if (dpm_level >= clk_table->NumMemPstatesEnabled) + return -EINVAL; + *freq = clk_table->MemPstateTable[dpm_level].MemClk; + break; + case SMU_FCLK: + if (dpm_level >= clk_table->NumFclkLevelsEnabled) + return -EINVAL; + *freq = clk_table->FclkClocks_Freq[dpm_level]; + break; + default: + return -EINVAL; + } + + return 0; +} + static int smu_v14_0_0_get_dpm_freq_by_index(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t dpm_level, @@ -637,6 +691,19 @@ static int smu_v14_0_0_get_dpm_freq_by_index(struct smu_context *smu, return 0; } +static int smu_v14_0_common_get_dpm_freq_by_index(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t dpm_level, + uint32_t *freq) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, dpm_level, freq); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_get_dpm_freq_by_index(smu, clk_type, dpm_level, freq); + + return 0; +} + static bool smu_v14_0_0_clk_dpm_is_enabled(struct smu_context *smu, enum smu_clk_type clk_type) { @@ -657,6 +724,8 @@ static bool smu_v14_0_0_clk_dpm_is_enabled(struct smu_context *smu, break; case SMU_VCLK: case SMU_DCLK: + case SMU_VCLK1: + case SMU_DCLK1: feature_id = SMU_FEATURE_VCN_DPM_BIT; break; default: @@ -666,6 +735,126 @@ static bool smu_v14_0_0_clk_dpm_is_enabled(struct smu_context *smu, return smu_cmn_feature_is_enabled(smu, feature_id); } +static int smu_v14_0_1_get_dpm_ultimate_freq(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *min, + uint32_t *max) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + uint32_t clock_limit; + uint32_t max_dpm_level, min_dpm_level; + int ret = 0; + + if (!smu_v14_0_0_clk_dpm_is_enabled(smu, clk_type)) { + switch (clk_type) { + case SMU_MCLK: + case SMU_UCLK: + clock_limit = smu->smu_table.boot_values.uclk; + break; + case SMU_FCLK: + clock_limit = smu->smu_table.boot_values.fclk; + break; + case SMU_GFXCLK: + case SMU_SCLK: + clock_limit = smu->smu_table.boot_values.gfxclk; + break; + case SMU_SOCCLK: + clock_limit = smu->smu_table.boot_values.socclk; + break; + case SMU_VCLK: + case SMU_VCLK1: + clock_limit = smu->smu_table.boot_values.vclk; + break; + case SMU_DCLK: + case SMU_DCLK1: + clock_limit = smu->smu_table.boot_values.dclk; + break; + default: + clock_limit = 0; + break; + } + + /* clock in Mhz unit */ + if (min) + *min = clock_limit / 100; + if (max) + *max = clock_limit / 100; + + return 0; + } + + if (max) { + switch (clk_type) { + case SMU_GFXCLK: + case SMU_SCLK: + *max = clk_table->MaxGfxClk; + break; + case SMU_MCLK: + case SMU_UCLK: + case SMU_FCLK: + max_dpm_level = 0; + break; + case SMU_SOCCLK: + max_dpm_level = clk_table->NumSocClkLevelsEnabled - 1; + break; + case SMU_VCLK: + case SMU_DCLK: + max_dpm_level = clk_table->Vcn0ClkLevelsEnabled - 1; + break; + case SMU_VCLK1: + case SMU_DCLK1: + max_dpm_level = clk_table->Vcn1ClkLevelsEnabled - 1; + break; + default: + ret = -EINVAL; + goto failed; + } + + if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, max_dpm_level, max); + if (ret) + goto failed; + } + } + + if (min) { + switch (clk_type) { + case SMU_GFXCLK: + case SMU_SCLK: + *min = clk_table->MinGfxClk; + break; + case SMU_MCLK: + case SMU_UCLK: + min_dpm_level = clk_table->NumMemPstatesEnabled - 1; + break; + case SMU_FCLK: + min_dpm_level = clk_table->NumFclkLevelsEnabled - 1; + break; + case SMU_SOCCLK: + min_dpm_level = 0; + break; + case SMU_VCLK: + case SMU_DCLK: + case SMU_VCLK1: + case SMU_DCLK1: + min_dpm_level = 0; + break; + default: + ret = -EINVAL; + goto failed; + } + + if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, min_dpm_level, min); + if (ret) + goto failed; + } + } + +failed: + return ret; +} + static int smu_v14_0_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t *min, @@ -736,7 +925,7 @@ static int smu_v14_0_0_get_dpm_ultimate_freq(struct smu_context *smu, } if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, max_dpm_level, max); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, max_dpm_level, max); if (ret) goto failed; } @@ -768,7 +957,7 @@ static int smu_v14_0_0_get_dpm_ultimate_freq(struct smu_context *smu, } if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, min_dpm_level, min); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, min_dpm_level, min); if (ret) goto failed; } @@ -778,6 +967,19 @@ failed: return ret; } +static int smu_v14_0_common_get_dpm_ultimate_freq(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *min, + uint32_t *max) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_get_dpm_ultimate_freq(smu, clk_type, min, max); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_get_dpm_ultimate_freq(smu, clk_type, min, max); + + return 0; +} + static int smu_v14_0_0_get_current_clk_freq(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t *value) @@ -811,6 +1013,37 @@ static int smu_v14_0_0_get_current_clk_freq(struct smu_context *smu, return smu_v14_0_0_get_smu_metrics_data(smu, member_type, value); } +static int smu_v14_0_1_get_dpm_level_count(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *count) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + + switch (clk_type) { + case SMU_SOCCLK: + *count = clk_table->NumSocClkLevelsEnabled; + break; + case SMU_VCLK: + case SMU_DCLK: + *count = clk_table->Vcn0ClkLevelsEnabled; + break; + case SMU_VCLK1: + case SMU_DCLK1: + *count = clk_table->Vcn1ClkLevelsEnabled; + break; + case SMU_MCLK: + *count = clk_table->NumMemPstatesEnabled; + break; + case SMU_FCLK: + *count = clk_table->NumFclkLevelsEnabled; + break; + default: + break; + } + + return 0; +} + static int smu_v14_0_0_get_dpm_level_count(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t *count) @@ -840,6 +1073,18 @@ static int smu_v14_0_0_get_dpm_level_count(struct smu_context *smu, return 0; } +static int smu_v14_0_common_get_dpm_level_count(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *count) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_get_dpm_level_count(smu, clk_type, count); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_get_dpm_level_count(smu, clk_type, count); + + return 0; +} + static int smu_v14_0_0_print_clk_levels(struct smu_context *smu, enum smu_clk_type clk_type, char *buf) { @@ -866,18 +1111,20 @@ static int smu_v14_0_0_print_clk_levels(struct smu_context *smu, case SMU_SOCCLK: case SMU_VCLK: case SMU_DCLK: + case SMU_VCLK1: + case SMU_DCLK1: case SMU_MCLK: case SMU_FCLK: ret = smu_v14_0_0_get_current_clk_freq(smu, clk_type, &cur_value); if (ret) break; - ret = smu_v14_0_0_get_dpm_level_count(smu, clk_type, &count); + ret = smu_v14_0_common_get_dpm_level_count(smu, clk_type, &count); if (ret) break; for (i = 0; i < count; i++) { - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, i, &value); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, i, &value); if (ret) break; @@ -940,8 +1187,13 @@ static int smu_v14_0_0_set_soft_freq_limited_range(struct smu_context *smu, break; case SMU_VCLK: case SMU_DCLK: - msg_set_min = SMU_MSG_SetHardMinVcn; - msg_set_max = SMU_MSG_SetSoftMaxVcn; + msg_set_min = SMU_MSG_SetHardMinVcn0; + msg_set_max = SMU_MSG_SetSoftMaxVcn0; + break; + case SMU_VCLK1: + case SMU_DCLK1: + msg_set_min = SMU_MSG_SetHardMinVcn1; + msg_set_max = SMU_MSG_SetSoftMaxVcn1; break; default: return -EINVAL; @@ -971,11 +1223,11 @@ static int smu_v14_0_0_force_clk_levels(struct smu_context *smu, case SMU_FCLK: case SMU_VCLK: case SMU_DCLK: - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, soft_min_level, &min_freq); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, soft_min_level, &min_freq); if (ret) break; - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, soft_max_level, &max_freq); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, soft_max_level, &max_freq); if (ret) break; @@ -1000,25 +1252,25 @@ static int smu_v14_0_0_set_performance_level(struct smu_context *smu, switch (level) { case AMD_DPM_FORCED_LEVEL_HIGH: - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SCLK, NULL, &sclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_FCLK, NULL, &fclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SOCCLK, NULL, &socclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, NULL, &sclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, NULL, &fclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SOCCLK, NULL, &socclk_max); sclk_min = sclk_max; fclk_min = fclk_max; socclk_min = socclk_max; break; case AMD_DPM_FORCED_LEVEL_LOW: - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, NULL); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, NULL); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, NULL); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, NULL); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, NULL); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, NULL); sclk_max = sclk_min; fclk_max = fclk_min; socclk_max = socclk_min; break; case AMD_DPM_FORCED_LEVEL_AUTO: - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, &sclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, &fclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, &socclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, &sclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, &fclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, &socclk_max); break; case AMD_DPM_FORCED_LEVEL_PROFILE_STANDARD: case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_SCLK: @@ -1067,6 +1319,18 @@ static int smu_v14_0_0_set_performance_level(struct smu_context *smu, return ret; } +static int smu_v14_0_1_set_fine_grain_gfx_freq_parameters(struct smu_context *smu) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + + smu->gfx_default_hard_min_freq = clk_table->MinGfxClk; + smu->gfx_default_soft_max_freq = clk_table->MaxGfxClk; + smu->gfx_actual_hard_min_freq = 0; + smu->gfx_actual_soft_max_freq = 0; + + return 0; +} + static int smu_v14_0_0_set_fine_grain_gfx_freq_parameters(struct smu_context *smu) { DpmClocks_t *clk_table = smu->smu_table.clocks_table; @@ -1079,6 +1343,16 @@ static int smu_v14_0_0_set_fine_grain_gfx_freq_parameters(struct smu_context *sm return 0; } +static int smu_v14_0_common_set_fine_grain_gfx_freq_parameters(struct smu_context *smu) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_set_fine_grain_gfx_freq_parameters(smu); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_set_fine_grain_gfx_freq_parameters(smu); + + return 0; +} + static int smu_v14_0_0_set_vpe_enable(struct smu_context *smu, bool enable) { @@ -1095,6 +1369,25 @@ static int smu_v14_0_0_set_umsch_mm_enable(struct smu_context *smu, 0, NULL); } +static int smu_14_0_1_get_dpm_table(struct smu_context *smu, struct dpm_clocks *clock_table) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + uint8_t idx; + + /* Only the Clock information of SOC and VPE is copied to provide VPE DPM settings for use. */ + for (idx = 0; idx < NUM_SOCCLK_DPM_LEVELS; idx++) { + clock_table->SocClocks[idx].Freq = (idx < clk_table->NumSocClkLevelsEnabled) ? clk_table->SocClocks[idx]:0; + clock_table->SocClocks[idx].Vol = 0; + } + + for (idx = 0; idx < NUM_VPE_DPM_LEVELS; idx++) { + clock_table->VPEClocks[idx].Freq = (idx < clk_table->VpeClkLevelsEnabled) ? clk_table->VPEClocks[idx]:0; + clock_table->VPEClocks[idx].Vol = 0; + } + + return 0; +} + static int smu_14_0_0_get_dpm_table(struct smu_context *smu, struct dpm_clocks *clock_table) { DpmClocks_t *clk_table = smu->smu_table.clocks_table; @@ -1114,6 +1407,16 @@ static int smu_14_0_0_get_dpm_table(struct smu_context *smu, struct dpm_clocks * return 0; } +static int smu_v14_0_common_get_dpm_table(struct smu_context *smu, struct dpm_clocks *clock_table) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_14_0_0_get_dpm_table(smu, clock_table); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_14_0_1_get_dpm_table(smu, clock_table); + + return 0; +} + static const struct pptable_funcs smu_v14_0_0_ppt_funcs = { .check_fw_status = smu_v14_0_check_fw_status, .check_fw_version = smu_v14_0_check_fw_version, @@ -1135,16 +1438,16 @@ static const struct pptable_funcs smu_v14_0_0_ppt_funcs = { .set_driver_table_location = smu_v14_0_set_driver_table_location, .gfx_off_control = smu_v14_0_gfx_off_control, .mode2_reset = smu_v14_0_0_mode2_reset, - .get_dpm_ultimate_freq = smu_v14_0_0_get_dpm_ultimate_freq, + .get_dpm_ultimate_freq = smu_v14_0_common_get_dpm_ultimate_freq, .od_edit_dpm_table = smu_v14_0_od_edit_dpm_table, .print_clk_levels = smu_v14_0_0_print_clk_levels, .force_clk_levels = smu_v14_0_0_force_clk_levels, .set_performance_level = smu_v14_0_0_set_performance_level, - .set_fine_grain_gfx_freq_parameters = smu_v14_0_0_set_fine_grain_gfx_freq_parameters, + .set_fine_grain_gfx_freq_parameters = smu_v14_0_common_set_fine_grain_gfx_freq_parameters, .set_gfx_power_up_by_imu = smu_v14_0_set_gfx_power_up_by_imu, .dpm_set_vpe_enable = smu_v14_0_0_set_vpe_enable, .dpm_set_umsch_mm_enable = smu_v14_0_0_set_umsch_mm_enable, - .get_dpm_clock_table = smu_14_0_0_get_dpm_table, + .get_dpm_clock_table = smu_v14_0_common_get_dpm_table, }; static void smu_v14_0_0_set_smu_mailbox_registers(struct smu_context *smu) From 533eefb9be76c3b23d220ee18edfda8eb56cefff Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Tue, 12 Dec 2023 17:17:05 +0800 Subject: [PATCH 113/270] drm/amdgpu: add smu 14.0.1 discovery support This patch to add smu 14.0.1 support Reviewed-by: Alex Deucher Signed-off-by: Yifan Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index fdd36fb027ab..ac5bf01fe8d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -1896,6 +1896,7 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &smu_v13_0_ip_block); break; case IP_VERSION(14, 0, 0): + case IP_VERSION(14, 0, 1): amdgpu_device_ip_block_add(adev, &smu_v14_0_ip_block); break; default: From f886b49feaae30acd599e37d4284836024b0f3ed Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 28 Mar 2024 18:22:10 +0800 Subject: [PATCH 114/270] drm/amdgpu: implement IRQ_STATE_ENABLE for SDMA v4.4.2 SDMA_CNTL is not set in some cases, driver configures it by itself. v2: simplify code Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 34237a1b1f2e..82eab49be82b 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1602,19 +1602,9 @@ static int sdma_v4_4_2_set_ecc_irq_state(struct amdgpu_device *adev, u32 sdma_cntl; sdma_cntl = RREG32_SDMA(type, regSDMA_CNTL); - switch (state) { - case AMDGPU_IRQ_STATE_DISABLE: - sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, - DRAM_ECC_INT_ENABLE, 0); - WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl); - break; - /* sdma ecc interrupt is enabled by default - * driver doesn't need to do anything to - * enable the interrupt */ - case AMDGPU_IRQ_STATE_ENABLE: - default: - break; - } + sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, DRAM_ECC_INT_ENABLE, + state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0); + WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl); return 0; } From ecedd99a9369fb5cde601ae9abd58bca2739f1ae Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Fri, 15 Mar 2024 21:25:25 -0600 Subject: [PATCH 115/270] drm/amd/display: Skip on writeback when it's not applicable [WHY] dynamic memory safety error detector (KASAN) catches and generates error messages "BUG: KASAN: slab-out-of-bounds" as writeback connector does not support certain features which are not initialized. [HOW] Skip them when connector type is DRM_MODE_CONNECTOR_WRITEBACK. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3199 Reviewed-by: Harry Wentland Reviewed-by: Rodrigo Siqueira Acked-by: Roman Li Signed-off-by: Alex Hung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 173438fc7537..6cf0d5f276fe 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3047,6 +3047,10 @@ static int dm_resume(void *handle) /* Do mst topology probing after resuming cached state*/ drm_connector_list_iter_begin(ddev, &iter); drm_for_each_connector_iter(connector, &iter) { + + if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK) + continue; + aconnector = to_amdgpu_dm_connector(connector); if (aconnector->dc_link->type != dc_connection_mst_branch || aconnector->mst_root) @@ -5926,6 +5930,9 @@ get_highest_refresh_rate_mode(struct amdgpu_dm_connector *aconnector, &aconnector->base.probed_modes : &aconnector->base.modes; + if (aconnector->base.connector_type == DRM_MODE_CONNECTOR_WRITEBACK) + return NULL; + if (aconnector->freesync_vid_base.clock != 0) return &aconnector->freesync_vid_base; @@ -8767,10 +8774,10 @@ static void amdgpu_dm_commit_audio(struct drm_device *dev, if (!drm_atomic_crtc_needs_modeset(new_crtc_state)) continue; +notify: if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK) continue; -notify: aconnector = to_amdgpu_dm_connector(connector); mutex_lock(&adev->dm.audio_lock); From 3818708e9c9712e2ba4006bc23502ee7b031bd3f Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Thu, 28 Mar 2024 11:00:50 +0800 Subject: [PATCH 116/270] drm/amd/pm: fix the high voltage issue after unload fix the high voltage issue after unload on smu 13.0.10 Signed-off-by: Kenneth Feng Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 26 ++++++++++-------- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 27 +++++++++++++++++-- drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 1 + .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 8 +++++- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index aa16d51dd842..7753a2e64d41 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4135,18 +4135,22 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->ip_blocks[i].status.hw = true; } } + } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && + !amdgpu_device_has_display_hardware(adev)) { + r = psp_gpu_reset(adev); } else { - tmp = amdgpu_reset_method; - /* It should do a default reset when loading or reloading the driver, - * regardless of the module parameter reset_method. - */ - amdgpu_reset_method = AMD_RESET_METHOD_NONE; - r = amdgpu_asic_reset(adev); - amdgpu_reset_method = tmp; - if (r) { - dev_err(adev->dev, "asic reset on init failed\n"); - goto failed; - } + tmp = amdgpu_reset_method; + /* It should do a default reset when loading or reloading the driver, + * regardless of the module parameter reset_method. + */ + amdgpu_reset_method = AMD_RESET_METHOD_NONE; + r = amdgpu_asic_reset(adev); + amdgpu_reset_method = tmp; + } + + if (r) { + dev_err(adev->dev, "asic reset on init failed\n"); + goto failed; } } diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 246b211b1e85..65333141b1c1 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -735,7 +735,7 @@ static int smu_early_init(void *handle) smu->adev = adev; smu->pm_enabled = !!amdgpu_dpm; smu->is_apu = false; - smu->smu_baco.state = SMU_BACO_STATE_EXIT; + smu->smu_baco.state = SMU_BACO_STATE_NONE; smu->smu_baco.platform_support = false; smu->user_dpm_profile.fan_mode = -1; @@ -1966,10 +1966,25 @@ static int smu_smc_hw_cleanup(struct smu_context *smu) return 0; } +static int smu_reset_mp1_state(struct smu_context *smu) +{ + struct amdgpu_device *adev = smu->adev; + int ret = 0; + + if ((!adev->in_runpm) && (!adev->in_suspend) && + (!amdgpu_in_reset(adev)) && amdgpu_ip_version(adev, MP1_HWIP, 0) == + IP_VERSION(13, 0, 10) && + !amdgpu_device_has_display_hardware(adev)) + ret = smu_set_mp1_state(smu, PP_MP1_STATE_UNLOAD); + + return ret; +} + static int smu_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct smu_context *smu = adev->powerplay.pp_handle; + int ret; if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) return 0; @@ -1987,7 +2002,15 @@ static int smu_hw_fini(void *handle) adev->pm.dpm_enabled = false; - return smu_smc_hw_cleanup(smu); + ret = smu_smc_hw_cleanup(smu); + if (ret) + return ret; + + ret = smu_reset_mp1_state(smu); + if (ret) + return ret; + + return 0; } static void smu_late_fini(void *handle) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index a870bdd49a4e..1fa81575788c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -424,6 +424,7 @@ enum smu_reset_mode { enum smu_baco_state { SMU_BACO_STATE_ENTER = 0, SMU_BACO_STATE_EXIT, + SMU_BACO_STATE_NONE, }; struct smu_baco_context { diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 9c03296f92cd..67117ced7c6a 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2751,7 +2751,13 @@ static int smu_v13_0_0_set_mp1_state(struct smu_context *smu, switch (mp1_state) { case PP_MP1_STATE_UNLOAD: - ret = smu_cmn_set_mp1_state(smu, mp1_state); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_PrepareMp1ForUnload, + 0x55, NULL); + + if (!ret && smu->smu_baco.state == SMU_BACO_STATE_EXIT) + ret = smu_v13_0_disable_pmfw_state(smu); + break; default: /* Ignore others */ From f7e232de51bb1b45646e5b7dc4ebcf13510f2630 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Wed, 6 Mar 2024 17:05:07 +0530 Subject: [PATCH 117/270] drm/amdgpu: Fix VCN allocation in CPX partition VCN need not be shared in CPX mode always for all GFX 9.4.3 SOC SKUs. In certain configs, VCN instance can be exclusively allocated to a partition even under CPX mode. Signed-off-by: Lijo Lazar Reviewed-by: James Zhu Reviewed-by: Asad Kamal Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c index d6f808acfb17..fbb43ae7624f 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c @@ -62,6 +62,11 @@ void aqua_vanjaram_doorbell_index_init(struct amdgpu_device *adev) adev->doorbell_index.max_assignment = AMDGPU_DOORBELL_LAYOUT1_MAX_ASSIGNMENT << 1; } +static bool aqua_vanjaram_xcp_vcn_shared(struct amdgpu_device *adev) +{ + return (adev->xcp_mgr->num_xcps > adev->vcn.num_vcn_inst); +} + static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev, uint32_t inst_idx, struct amdgpu_ring *ring) { @@ -87,7 +92,7 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev, case AMDGPU_RING_TYPE_VCN_ENC: case AMDGPU_RING_TYPE_VCN_JPEG: ip_blk = AMDGPU_XCP_VCN; - if (adev->xcp_mgr->mode == AMDGPU_CPX_PARTITION_MODE) + if (aqua_vanjaram_xcp_vcn_shared(adev)) inst_mask = 1 << (inst_idx * 2); break; default: @@ -140,10 +145,12 @@ static int aqua_vanjaram_xcp_sched_list_update( aqua_vanjaram_xcp_gpu_sched_update(adev, ring, ring->xcp_id); - /* VCN is shared by two partitions under CPX MODE */ + /* VCN may be shared by two partitions under CPX MODE in certain + * configs. + */ if ((ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC || - ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG) && - adev->xcp_mgr->mode == AMDGPU_CPX_PARTITION_MODE) + ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG) && + aqua_vanjaram_xcp_vcn_shared(adev)) aqua_vanjaram_xcp_gpu_sched_update(adev, ring, ring->xcp_id + 1); } From e33997e18d0fddd217a0fce988abbfd015338631 Mon Sep 17 00:00:00 2001 From: ZhenGuo Yin Date: Tue, 2 Apr 2024 11:41:05 +0800 Subject: [PATCH 118/270] drm/amdgpu: clear set_q_mode_offs when VM changed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] set_q_mode_offs don't get cleared after GPU reset, nexting SET_Q_MODE packet to init shadow memory will be skiped, hence there has a page fault. [How] VM flush is needed after GPU reset, clear set_q_mode_offs when emitting VM flush. Fixes: 8bc75586ea01 ("drm/amdgpu: workaround to avoid SET_Q_MODE packets v2") Reviewed-by: Christian König Signed-off-by: ZhenGuo Yin Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 1770e496c1b7..2c82e2741619 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -5465,6 +5465,7 @@ static void gfx_v11_0_ring_emit_vm_flush(struct amdgpu_ring *ring, /* Make sure that we can't skip the SET_Q_MODE packets when the VM * changed in any way. */ + ring->set_q_mode_offs = 0; ring->set_q_mode_ptr = NULL; } From d06af584be5a769d124b7302b32a033e9559761d Mon Sep 17 00:00:00 2001 From: Zhigang Luo Date: Mon, 18 Mar 2024 14:13:10 -0400 Subject: [PATCH 119/270] amd/amdkfd: sync all devices to wait all processes being evicted If there are more than one device doing reset in parallel, the first device will call kfd_suspend_all_processes() to evict all processes on all devices, this call takes time to finish. other device will start reset and recover without waiting. if the process has not been evicted before doing recover, it will be restored, then caused page fault. Signed-off-by: Zhigang Luo Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 041ec3de55e7..719d6d365e15 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -960,7 +960,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) { struct kfd_node *node; int i; - int count; if (!kfd->init_complete) return; @@ -968,12 +967,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) /* for runtime suspend, skip locking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = ++kfd_locked; - mutex_unlock(&kfd_processes_mutex); - /* For first KFD device suspend all the KFD processes */ - if (count == 1) + if (++kfd_locked == 1) kfd_suspend_all_processes(); + mutex_unlock(&kfd_processes_mutex); } for (i = 0; i < kfd->num_nodes; i++) { @@ -984,7 +981,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) { - int ret, count, i; + int ret, i; if (!kfd->init_complete) return 0; @@ -998,12 +995,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) /* for runtime resume, skip unlocking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = --kfd_locked; - mutex_unlock(&kfd_processes_mutex); - - WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); - if (count == 0) + if (--kfd_locked == 0) ret = kfd_resume_all_processes(); + WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error"); + mutex_unlock(&kfd_processes_mutex); } return ret; From 2cc69a10d83180f3de9f5afe3a98e972b1453d4c Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Sat, 23 Mar 2024 12:02:54 -0600 Subject: [PATCH 120/270] drm/amd/display: Return max resolution supported by DWB mode_config's max width x height is 4096x2160 and is higher than DWB's max resolution 3840x2160 which is returned instead. Cc: stable@vger.kernel.org Reviewed-by: Harry Wentland Acked-by: Hamza Mahfooz Signed-off-by: Alex Hung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c index 16e72d623630..08c494a7a21b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c @@ -76,10 +76,8 @@ static int amdgpu_dm_wb_encoder_atomic_check(struct drm_encoder *encoder, static int amdgpu_dm_wb_connector_get_modes(struct drm_connector *connector) { - struct drm_device *dev = connector->dev; - - return drm_add_modes_noedid(connector, dev->mode_config.max_width, - dev->mode_config.max_height); + /* Maximum resolution supported by DWB */ + return drm_add_modes_noedid(connector, 3840, 2160); } static int amdgpu_dm_wb_prepare_job(struct drm_writeback_connector *wb_connector, From bbca7f414ae9a12ea231cdbafd79c607e3337ea8 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Wed, 3 Apr 2024 17:28:44 +0800 Subject: [PATCH 121/270] drm/amdgpu: fix incorrect number of active RBs for gfx11 The RB bitmap should be global active RB bitmap & active RB bitmap based on active SA. Signed-off-by: Tim Huang Reviewed-by: Yifan Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 2c82e2741619..f7325b02a191 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1635,7 +1635,7 @@ static void gfx_v11_0_setup_rb(struct amdgpu_device *adev) active_rb_bitmap |= (0x3 << (i * rb_bitmap_width_per_sa)); } - active_rb_bitmap |= global_active_rb_bitmap; + active_rb_bitmap &= global_active_rb_bitmap; adev->gfx.config.backend_enable_mask = active_rb_bitmap; adev->gfx.config.num_rbs = hweight32(active_rb_bitmap); } From 81901d8d0472e9a19d294ae1dea76b950548195d Mon Sep 17 00:00:00 2001 From: Wenjing Liu Date: Fri, 22 Mar 2024 15:02:45 -0400 Subject: [PATCH 122/270] drm/amd/display: always reset ODM mode in context when adding first plane [why] In current implemenation ODM mode is only reset when the last plane is removed from dc state. For any dc validate we will always remove all current planes and add new planes. However when switching from no planes to 1 plane, ODM mode is not reset because no planes get removed. This has caused an issue where we kept ODM combine when it should have been remove when a plane is added. The change is to reset ODM mode when adding the first plane. Cc: stable@vger.kernel.org Reviewed-by: Alvin Lee Acked-by: Hamza Mahfooz Signed-off-by: Wenjing Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_state.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c b/drivers/gpu/drm/amd/display/dc/core/dc_state.c index 5cc7f8da209c..61986e5cb491 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c @@ -436,6 +436,15 @@ bool dc_state_add_plane( goto out; } + if (stream_status->plane_count == 0 && dc->config.enable_windowed_mpo_odm) + /* ODM combine could prevent us from supporting more planes + * we will reset ODM slice count back to 1 when all planes have + * been removed to maximize the amount of planes supported when + * new planes are added. + */ + resource_update_pipes_for_stream_with_slice_count( + state, dc->current_state, dc->res_pool, stream, 1); + otg_master_pipe = resource_get_otg_master_for_stream( &state->res_ctx, stream); if (otg_master_pipe) From 953927587f37b731abdeabe46ad44a3b3ec67a52 Mon Sep 17 00:00:00 2001 From: Dillon Varone Date: Thu, 21 Mar 2024 13:49:43 -0400 Subject: [PATCH 123/270] drm/amd/display: Do not recursively call manual trigger programming [WHY&HOW] We should not be recursively calling the manual trigger programming function when FAMS is not in use. Cc: stable@vger.kernel.org Reviewed-by: Alvin Lee Acked-by: Hamza Mahfooz Signed-off-by: Dillon Varone Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c index f07a4c7e48bc..52eab8fccb7f 100644 --- a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c +++ b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c @@ -267,9 +267,6 @@ static void optc32_setup_manual_trigger(struct timing_generator *optc) OTG_V_TOTAL_MAX_SEL, 1, OTG_FORCE_LOCK_ON_EVENT, 0, OTG_SET_V_TOTAL_MIN_MASK, (1 << 1)); /* TRIGA */ - - // Setup manual flow control for EOF via TRIG_A - optc->funcs->setup_manual_trigger(optc); } } From cf79814cb0bf5749b9f0db53ca231aa540c02768 Mon Sep 17 00:00:00 2001 From: Fudongwang Date: Tue, 26 Mar 2024 16:03:16 +0800 Subject: [PATCH 124/270] drm/amd/display: fix disable otg wa logic in DCN316 [Why] Wrong logic cause screen corruption. [How] Port logic from DCN35/314. Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Hamza Mahfooz Signed-off-by: Fudongwang Signed-off-by: Alex Deucher --- .../dc/clk_mgr/dcn316/dcn316_clk_mgr.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c index 12f3e8aa46d8..6ad4f4efec5d 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c @@ -99,20 +99,25 @@ static int dcn316_get_active_display_cnt_wa( return display_count; } -static void dcn316_disable_otg_wa(struct clk_mgr *clk_mgr_base, struct dc_state *context, bool disable) +static void dcn316_disable_otg_wa(struct clk_mgr *clk_mgr_base, struct dc_state *context, + bool safe_to_lower, bool disable) { struct dc *dc = clk_mgr_base->ctx->dc; int i; for (i = 0; i < dc->res_pool->pipe_count; ++i) { - struct pipe_ctx *pipe = &dc->current_state->res_ctx.pipe_ctx[i]; + struct pipe_ctx *pipe = safe_to_lower + ? &context->res_ctx.pipe_ctx[i] + : &dc->current_state->res_ctx.pipe_ctx[i]; if (pipe->top_pipe || pipe->prev_odm_pipe) continue; - if (pipe->stream && (pipe->stream->dpms_off || pipe->plane_state == NULL || - dc_is_virtual_signal(pipe->stream->signal))) { + if (pipe->stream && (pipe->stream->dpms_off || dc_is_virtual_signal(pipe->stream->signal) || + !pipe->stream->link_enc)) { if (disable) { - pipe->stream_res.tg->funcs->immediate_disable_crtc(pipe->stream_res.tg); + if (pipe->stream_res.tg && pipe->stream_res.tg->funcs->immediate_disable_crtc) + pipe->stream_res.tg->funcs->immediate_disable_crtc(pipe->stream_res.tg); + reset_sync_context_for_pipe(dc, context, i); } else pipe->stream_res.tg->funcs->enable_crtc(pipe->stream_res.tg); @@ -207,11 +212,11 @@ static void dcn316_update_clocks(struct clk_mgr *clk_mgr_base, } if (should_set_clock(safe_to_lower, new_clocks->dispclk_khz, clk_mgr_base->clks.dispclk_khz)) { - dcn316_disable_otg_wa(clk_mgr_base, context, true); + dcn316_disable_otg_wa(clk_mgr_base, context, safe_to_lower, true); clk_mgr_base->clks.dispclk_khz = new_clocks->dispclk_khz; dcn316_smu_set_dispclk(clk_mgr, clk_mgr_base->clks.dispclk_khz); - dcn316_disable_otg_wa(clk_mgr_base, context, false); + dcn316_disable_otg_wa(clk_mgr_base, context, safe_to_lower, false); update_dispclk = true; } From 9e61ef8d219877202d4ee51d0d2ad9072c99a262 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Tue, 12 Mar 2024 11:55:52 -0400 Subject: [PATCH 125/270] drm/amd/display: Program VSC SDP colorimetry for all DP sinks >= 1.4 In order for display colorimetry to work correctly on DP displays we need to send the VSC SDP packet. We should only do so for panels with DPCD revision greater or equal to 1.4 as older receivers might have problems with it. Cc: stable@vger.kernel.org Cc: Joshua Ashton Cc: Xaver Hugl Cc: Melissa Wen Cc: Agustin Gutierrez Reviewed-by: Agustin Gutierrez Acked-by: Hamza Mahfooz Signed-off-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 6cf0d5f276fe..d112ce6c812c 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6318,7 +6318,9 @@ create_stream_for_sink(struct drm_connector *connector, if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) mod_build_hf_vsif_infopacket(stream, &stream->vsp_infopacket); - if (stream->link->psr_settings.psr_feature_enabled || stream->link->replay_settings.replay_feature_enabled) { + if (stream->signal == SIGNAL_TYPE_DISPLAY_PORT || + stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST || + stream->signal == SIGNAL_TYPE_EDP) { // // should decide stream support vsc sdp colorimetry capability // before building vsc info packet @@ -6328,7 +6330,8 @@ create_stream_for_sink(struct drm_connector *connector, stream->use_vsc_sdp_for_colorimetry = aconnector->dc_sink->is_vsc_sdp_colorimetry_supported; } else { - if (stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED) + if (stream->link->dpcd_caps.dpcd_rev.raw >= 0x14 && + stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED) stream->use_vsc_sdp_for_colorimetry = true; } if (stream->out_transfer_func->tf == TRANSFER_FUNCTION_GAMMA22) From c3e2a5f2da904a18661335e8be2b961738574998 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Thu, 21 Mar 2024 11:13:38 -0400 Subject: [PATCH 126/270] drm/amd/display: Set VSC SDP Colorimetry same way for MST and SST The previous check for the is_vsc_sdp_colorimetry_supported flag for MST sink signals did nothing. Simplify the code and use the same check for MST and SST. Cc: stable@vger.kernel.org Reviewed-by: Agustin Gutierrez Acked-by: Hamza Mahfooz Signed-off-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index d112ce6c812c..6d2f60c61dec 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6325,15 +6325,9 @@ create_stream_for_sink(struct drm_connector *connector, // should decide stream support vsc sdp colorimetry capability // before building vsc info packet // - stream->use_vsc_sdp_for_colorimetry = false; - if (aconnector->dc_sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT_MST) { - stream->use_vsc_sdp_for_colorimetry = - aconnector->dc_sink->is_vsc_sdp_colorimetry_supported; - } else { - if (stream->link->dpcd_caps.dpcd_rev.raw >= 0x14 && - stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED) - stream->use_vsc_sdp_for_colorimetry = true; - } + stream->use_vsc_sdp_for_colorimetry = stream->link->dpcd_caps.dpcd_rev.raw >= 0x14 && + stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED; + if (stream->out_transfer_func->tf == TRANSFER_FUNCTION_GAMMA22) tf = TRANSFER_FUNC_GAMMA_22; mod_build_vsc_infopacket(stream, &stream->vsc_infopacket, stream->output_color_space, tf); From e047dd448d2bc12b8c30d7e3e6e98cea1fc28a17 Mon Sep 17 00:00:00 2001 From: Zhongwei Date: Wed, 27 Mar 2024 13:49:40 +0800 Subject: [PATCH 127/270] drm/amd/display: Adjust dprefclk by down spread percentage. [Why] OLED panels show no display for large vtotal timings. [How] Check if ss is enabled and read from lut for spread spectrum percentage. Adjust dprefclk as required. DP_DTO adjustment is for edp only. Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Hamza Mahfooz Signed-off-by: Zhongwei Signed-off-by: Alex Deucher --- .../display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 50 +++++++++++++++++++ .../drm/amd/display/dc/dce/dce_clock_source.c | 8 +-- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c index 101fe96287cb..d9c5692c86c2 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c @@ -73,6 +73,12 @@ #define CLK1_CLK2_BYPASS_CNTL__CLK2_BYPASS_SEL_MASK 0x00000007L #define CLK1_CLK2_BYPASS_CNTL__CLK2_BYPASS_DIV_MASK 0x000F0000L +#define regCLK5_0_CLK5_spll_field_8 0x464b +#define regCLK5_0_CLK5_spll_field_8_BASE_IDX 0 + +#define CLK5_0_CLK5_spll_field_8__spll_ssc_en__SHIFT 0xd +#define CLK5_0_CLK5_spll_field_8__spll_ssc_en_MASK 0x00002000L + #define SMU_VER_THRESHOLD 0x5D4A00 //93.74.0 #define REG(reg_name) \ @@ -411,6 +417,17 @@ static void dcn35_dump_clk_registers(struct clk_state_registers_and_bypass *regs { } +static bool dcn35_is_spll_ssc_enabled(struct clk_mgr *clk_mgr_base) +{ + struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base); + struct dc_context *ctx = clk_mgr->base.ctx; + uint32_t ssc_enable; + + REG_GET(CLK5_0_CLK5_spll_field_8, spll_ssc_en, &ssc_enable); + + return ssc_enable == 1; +} + static void init_clk_states(struct clk_mgr *clk_mgr) { struct clk_mgr_internal *clk_mgr_int = TO_CLK_MGR_INTERNAL(clk_mgr); @@ -428,7 +445,16 @@ static void init_clk_states(struct clk_mgr *clk_mgr) void dcn35_init_clocks(struct clk_mgr *clk_mgr) { + struct clk_mgr_internal *clk_mgr_int = TO_CLK_MGR_INTERNAL(clk_mgr); init_clk_states(clk_mgr); + + // to adjust dp_dto reference clock if ssc is enable otherwise to apply dprefclk + if (dcn35_is_spll_ssc_enabled(clk_mgr)) + clk_mgr->dp_dto_source_clock_in_khz = + dce_adjust_dp_ref_freq_for_ss(clk_mgr_int, clk_mgr->dprefclk_khz); + else + clk_mgr->dp_dto_source_clock_in_khz = clk_mgr->dprefclk_khz; + } static struct clk_bw_params dcn35_bw_params = { .vram_type = Ddr4MemType, @@ -517,6 +543,28 @@ static DpmClocks_t_dcn35 dummy_clocks; static struct dcn35_watermarks dummy_wms = { 0 }; +static struct dcn35_ss_info_table ss_info_table = { + .ss_divider = 1000, + .ss_percentage = {0, 0, 375, 375, 375} +}; + +static void dcn35_read_ss_info_from_lut(struct clk_mgr_internal *clk_mgr) +{ + struct dc_context *ctx = clk_mgr->base.ctx; + uint32_t clock_source; + + REG_GET(CLK1_CLK2_BYPASS_CNTL, CLK2_BYPASS_SEL, &clock_source); + // If it's DFS mode, clock_source is 0. + if (dcn35_is_spll_ssc_enabled(&clk_mgr->base) && (clock_source < ARRAY_SIZE(ss_info_table.ss_percentage))) { + clk_mgr->dprefclk_ss_percentage = ss_info_table.ss_percentage[clock_source]; + + if (clk_mgr->dprefclk_ss_percentage != 0) { + clk_mgr->ss_on_dprefclk = true; + clk_mgr->dprefclk_ss_divider = ss_info_table.ss_divider; + } + } +} + static void dcn35_build_watermark_ranges(struct clk_bw_params *bw_params, struct dcn35_watermarks *table) { int i, num_valid_sets; @@ -1061,6 +1109,8 @@ void dcn35_clk_mgr_construct( dce_clock_read_ss_info(&clk_mgr->base); /*when clk src is from FCH, it could have ss, same clock src as DPREF clk*/ + dcn35_read_ss_info_from_lut(&clk_mgr->base); + clk_mgr->base.base.bw_params = &dcn35_bw_params; if (clk_mgr->base.base.ctx->dc->debug.pstate_enabled) { diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c b/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c index 970644b695cd..b5e0289d2fe8 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c @@ -976,7 +976,10 @@ static bool dcn31_program_pix_clk( struct bp_pixel_clock_parameters bp_pc_params = {0}; enum transmitter_color_depth bp_pc_colour_depth = TRANSMITTER_COLOR_DEPTH_24; - if (clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz != 0) + // Apply ssed(spread spectrum) dpref clock for edp only. + if (clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz != 0 + && pix_clk_params->signal_type == SIGNAL_TYPE_EDP + && encoding == DP_8b_10b_ENCODING) dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz; // For these signal types Driver to program DP_DTO without calling VBIOS Command table if (dc_is_dp_signal(pix_clk_params->signal_type) || dc_is_virtual_signal(pix_clk_params->signal_type)) { @@ -1093,9 +1096,6 @@ static bool get_pixel_clk_frequency_100hz( unsigned int modulo_hz = 0; unsigned int dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dprefclk_khz; - if (clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz != 0) - dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz; - if (clock_source->id == CLOCK_SOURCE_ID_DP_DTO) { clock_hz = REG_READ(PHASE[inst]); From 6dba20d23e85034901ccb765a7ca71199bcca4df Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Sun, 7 Apr 2024 22:01:35 +0800 Subject: [PATCH 128/270] drm/amdgpu: differentiate external rev id for gfx 11.5.0 This patch to differentiate external rev id for gfx 11.5.0. Signed-off-by: Yifan Zhang Reviewed-by: Tim Huang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc21.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index abe319b0f063..43ca63fe85ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -720,7 +720,10 @@ static int soc21_common_early_init(void *handle) AMD_PG_SUPPORT_VCN | AMD_PG_SUPPORT_JPEG | AMD_PG_SUPPORT_GFX_PG; - adev->external_rev_id = adev->rev_id + 0x1; + if (adev->rev_id == 0) + adev->external_rev_id = 0x1; + else + adev->external_rev_id = adev->rev_id + 0x10; break; case IP_VERSION(11, 5, 1): adev->cg_flags = From dec8ced871e17eea46f097542dd074d022be4bd1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 5 Mar 2024 22:10:03 -0800 Subject: [PATCH 129/270] perf/x86: Fix out of range data On x86 each struct cpu_hw_events maintains a table for counter assignment but it missed to update one for the deleted event in x86_pmu_del(). This can make perf_clear_dirty_counters() reset used counter if it's called before event scheduling or enabling. Then it would return out of range data which doesn't make sense. The following code can reproduce the problem. $ cat repro.c #include #include #include #include #include #include #include #include struct perf_event_attr attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, .disabled = 1, }; void *worker(void *arg) { int cpu = (long)arg; int fd1 = syscall(SYS_perf_event_open, &attr, -1, cpu, -1, 0); int fd2 = syscall(SYS_perf_event_open, &attr, -1, cpu, -1, 0); void *p; do { ioctl(fd1, PERF_EVENT_IOC_ENABLE, 0); p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd1, 0); ioctl(fd2, PERF_EVENT_IOC_ENABLE, 0); ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0); munmap(p, 4096); ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0); } while (1); return NULL; } int main(void) { int i; int n = sysconf(_SC_NPROCESSORS_ONLN); pthread_t *th = calloc(n, sizeof(*th)); for (i = 0; i < n; i++) pthread_create(&th[i], NULL, worker, (void *)(long)i); for (i = 0; i < n; i++) pthread_join(th[i], NULL); free(th); return 0; } And you can see the out of range data using perf stat like this. Probably it'd be easier to see on a large machine. $ gcc -o repro repro.c -pthread $ ./repro & $ sudo perf stat -A -I 1000 2>&1 | awk '{ if (length($3) > 15) print }' 1.001028462 CPU6 196,719,295,683,763 cycles # 194290.996 GHz (71.54%) 1.001028462 CPU3 396,077,485,787,730 branch-misses # 15804359784.80% of all branches (71.07%) 1.001028462 CPU17 197,608,350,727,877 branch-misses # 14594186554.56% of all branches (71.22%) 2.020064073 CPU4 198,372,472,612,140 cycles # 194681.113 GHz (70.95%) 2.020064073 CPU6 199,419,277,896,696 cycles # 195720.007 GHz (70.57%) 2.020064073 CPU20 198,147,174,025,639 cycles # 194474.654 GHz (71.03%) 2.020064073 CPU20 198,421,240,580,145 stalled-cycles-frontend # 100.14% frontend cycles idle (70.93%) 3.037443155 CPU4 197,382,689,923,416 cycles # 194043.065 GHz (71.30%) 3.037443155 CPU20 196,324,797,879,414 cycles # 193003.773 GHz (71.69%) 3.037443155 CPU5 197,679,956,608,205 stalled-cycles-backend # 1315606428.66% backend cycles idle (71.19%) 3.037443155 CPU5 198,571,860,474,851 instructions # 13215422.58 insn per cycle It should move the contents in the cpuc->assign as well. Fixes: 5471eea5d3bf ("perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task") Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240306061003.1894224-1-namhyung@kernel.org --- arch/x86/events/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 09050641ce5d..5b0dd07b1ef1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1644,6 +1644,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; + cpuc->assign[i-1] = cpuc->assign[i]; } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; From 04f4230e2f86a4e961ea5466eda3db8c1762004d Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Tue, 9 Apr 2024 16:08:05 -0700 Subject: [PATCH 130/270] x86/bugs: Fix return type of spectre_bhi_state() The definition of spectre_bhi_state() incorrectly returns a const char * const. This causes the a compiler warning when building with W=1: warning: type qualifiers ignored on function return type [-Wignored-qualifiers] 2812 | static const char * const spectre_bhi_state(void) Remove the const qualifier from the pointer. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Reported-by: Sean Christopherson Signed-off-by: Daniel Sneddon Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240409230806.1545822-1-daniel.sneddon@linux.intel.com --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 295463707e68..27f5004a8e24 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2809,7 +2809,7 @@ static char *pbrsb_eibrs_state(void) } } -static const char * const spectre_bhi_state(void) +static const char *spectre_bhi_state(void) { if (!boot_cpu_has_bug(X86_BUG_BHI)) return "; BHI: Not affected"; From f87cbcb345d059f0377b4fa0ba1b766a17fc3710 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Apr 2024 12:29:12 +0200 Subject: [PATCH 131/270] timekeeping: Use READ/WRITE_ONCE() for tick_do_timer_cpu tick_do_timer_cpu is used lockless to check which CPU needs to take care of the per tick timekeeping duty. This is done to avoid a thundering herd problem on jiffies_lock. The read and writes are not annotated so KCSAN complains about data races: BUG: KCSAN: data-race in tick_nohz_idle_stop_tick / tick_nohz_next_event write to 0xffffffff8a2bda30 of 4 bytes by task 0 on cpu 26: tick_nohz_idle_stop_tick+0x3b1/0x4a0 do_idle+0x1e3/0x250 read to 0xffffffff8a2bda30 of 4 bytes by task 0 on cpu 16: tick_nohz_next_event+0xe7/0x1e0 tick_nohz_get_sleep_length+0xa7/0xe0 menu_select+0x82/0xb90 cpuidle_select+0x44/0x60 do_idle+0x1c2/0x250 value changed: 0x0000001a -> 0xffffffff Annotate them with READ/WRITE_ONCE() to document the intentional data race. Reported-by: Mirsad Todorovac Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Tested-by: Sean Anderson Link: https://lore.kernel.org/r/87cyqy7rt3.ffs@tglx --- kernel/time/tick-common.c | 17 +++++++++-------- kernel/time/tick-sched.c | 36 ++++++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index fb0fdec8719a..d88b13076b79 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -7,6 +7,7 @@ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ +#include #include #include #include @@ -84,7 +85,7 @@ int tick_is_oneshot_available(void) */ static void tick_periodic(int cpu) { - if (tick_do_timer_cpu == cpu) { + if (READ_ONCE(tick_do_timer_cpu) == cpu) { raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); @@ -215,8 +216,8 @@ static void tick_setup_device(struct tick_device *td, * If no cpu took the do_timer update, assign it to * this cpu: */ - if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - tick_do_timer_cpu = cpu; + if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) { + WRITE_ONCE(tick_do_timer_cpu, cpu); tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* @@ -232,7 +233,7 @@ static void tick_setup_device(struct tick_device *td, !tick_nohz_full_cpu(cpu)) { tick_take_do_timer_from_boot(); tick_do_timer_boot_cpu = -1; - WARN_ON(tick_do_timer_cpu != cpu); + WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu); #endif } @@ -406,10 +407,10 @@ void tick_assert_timekeeping_handover(void) int tick_cpu_dying(unsigned int dying_cpu) { /* - * If the current CPU is the timekeeper, it's the only one that - * can safely hand over its duty. Also all online CPUs are in - * stop machine, guaranteed not to be idle, therefore it's safe - * to pick any online successor. + * If the current CPU is the timekeeper, it's the only one that can + * safely hand over its duty. Also all online CPUs are in stop + * machine, guaranteed not to be idle, therefore there is no + * concurrency and it's safe to pick any online successor. */ if (tick_do_timer_cpu == dying_cpu) tick_do_timer_cpu = cpumask_first(cpu_online_mask); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1331216a9cae..71a792cd8936 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -8,6 +8,7 @@ * * Started by: Thomas Gleixner and Ingo Molnar */ +#include #include #include #include @@ -204,7 +205,7 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts, static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { - int cpu = smp_processor_id(); + int tick_cpu, cpu = smp_processor_id(); /* * Check if the do_timer duty was dropped. We don't care about @@ -216,16 +217,18 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) * If nohz_full is enabled, this should not happen because the * 'tick_do_timer_cpu' CPU never relinquishes. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && - unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { + tick_cpu = READ_ONCE(tick_do_timer_cpu); + + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif - tick_do_timer_cpu = cpu; + WRITE_ONCE(tick_do_timer_cpu, cpu); + tick_cpu = cpu; } /* Check if jiffies need an update */ - if (tick_do_timer_cpu == cpu) + if (tick_cpu == cpu) tick_do_update_jiffies64(now); /* @@ -610,7 +613,7 @@ bool tick_nohz_cpu_hotpluggable(unsigned int cpu) * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ - if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu) return false; return true; } @@ -891,6 +894,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, delta, expires; unsigned long basejiff; + int tick_cpu; basemono = get_jiffies_update(&basejiff); ts->last_jiffies = basejiff; @@ -947,9 +951,9 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); - if (cpu != tick_do_timer_cpu && - (tick_do_timer_cpu != TICK_DO_TIMER_NONE || - !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) + tick_cpu = READ_ONCE(tick_do_timer_cpu); + if (tick_cpu != cpu && + (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) delta = KTIME_MAX; /* Calculate the next expiry time */ @@ -970,6 +974,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) unsigned long basejiff = ts->last_jiffies; u64 basemono = ts->timer_expires_base; bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); + int tick_cpu; u64 expires; /* Make sure we won't be trying to stop it twice in a row. */ @@ -1007,10 +1012,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ - if (cpu == tick_do_timer_cpu) { - tick_do_timer_cpu = TICK_DO_TIMER_NONE; + tick_cpu = READ_ONCE(tick_do_timer_cpu); + if (tick_cpu == cpu) { + WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE); tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); - } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + } else if (tick_cpu != TICK_DO_TIMER_NONE) { tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); } @@ -1173,15 +1179,17 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; if (tick_nohz_full_enabled()) { + int tick_cpu = READ_ONCE(tick_do_timer_cpu); + /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ - if (tick_do_timer_cpu == cpu) + if (tick_cpu == cpu) return false; /* Should not happen for nohz-full */ - if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) return false; } From a9025cd1c673a8d6eefc79d911075b8b452eba8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Apr 2024 15:22:01 +0200 Subject: [PATCH 132/270] x86/topology: Don't update cpu_possible_map in topo_set_cpuids() topo_set_cpuids() updates cpu_present_map and cpu_possible map. It is invoked during enumeration and "physical hotplug" operations. In the latter case this results in a kernel crash because cpu_possible_map is marked read only after init completes. There is no reason to update cpu_possible_map in that function. During enumeration cpu_possible_map is not relevant and gets fully initialized after enumeration completed. On "physical hotplug" the bit is already set because the kernel allows only CPUs to be plugged which have been enumerated and associated to a CPU number during early boot. Remove the bogus update of cpu_possible_map. Fixes: 0e53e7b656cf ("x86/cpu/topology: Sanitize the APIC admission logic") Reported-by: Jonathan Cameron Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87ttkc6kwx.ffs@tglx --- arch/x86/kernel/cpu/topology.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index aaca8d235dc2..d17c9b71eb4a 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -123,7 +123,6 @@ static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id) early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id; early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id; #endif - set_cpu_possible(cpu, true); set_cpu_present(cpu, true); } @@ -210,7 +209,11 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) topo_info.nr_disabled_cpus++; } - /* Register present and possible CPUs in the domain maps */ + /* + * Register present and possible CPUs in the domain + * maps. cpu_possible_map will be updated in + * topology_init_possible_cpus() after enumeration is done. + */ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map); } From f337a6a21e2fd67eadea471e93d05dd37baaa9be Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Apr 2024 10:51:05 -0700 Subject: [PATCH 133/270] x86/cpu: Actually turn off mitigations by default for SPECULATION_MITIGATIONS=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initialize cpu_mitigations to CPU_MITIGATIONS_OFF if the kernel is built with CONFIG_SPECULATION_MITIGATIONS=n, as the help text quite clearly states that disabling SPECULATION_MITIGATIONS is supposed to turn off all mitigations by default. │ If you say N, all mitigations will be disabled. You really │ should know what you are doing to say so. As is, the kernel still defaults to CPU_MITIGATIONS_AUTO, which results in some mitigations being enabled in spite of SPECULATION_MITIGATIONS=n. Fixes: f43b9876e857 ("x86/retbleed: Add fine grained Kconfig knobs") Signed-off-by: Sean Christopherson Signed-off-by: Ingo Molnar Reviewed-by: Daniel Sneddon Cc: stable@vger.kernel.org Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240409175108.1512861-2-seanjc@google.com --- kernel/cpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 8f6affd051f7..07ad53b7f119 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3207,7 +3207,8 @@ enum cpu_mitigations { }; static enum cpu_mitigations cpu_mitigations __ro_after_init = - CPU_MITIGATIONS_AUTO; + IS_ENABLED(CONFIG_SPECULATION_MITIGATIONS) ? CPU_MITIGATIONS_AUTO : + CPU_MITIGATIONS_OFF; static int __init mitigations_parse_cmdline(char *arg) { From e3ba51ab24fddef79fc212f9840de54db8fd1685 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 5 Apr 2024 13:58:50 +1000 Subject: [PATCH 134/270] arm64: tlb: Fix TLBI RANGE operand KVM/arm64 relies on TLBI RANGE feature to flush TLBs when the dirty pages are collected by VMM and the page table entries become write protected during live migration. Unfortunately, the operand passed to the TLBI RANGE instruction isn't correctly sorted out due to the commit 117940aa6e5f ("KVM: arm64: Define kvm_tlb_flush_vmid_range()"). It leads to crash on the destination VM after live migration because TLBs aren't flushed completely and some of the dirty pages are missed. For example, I have a VM where 8GB memory is assigned, starting from 0x40000000 (1GB). Note that the host has 4KB as the base page size. In the middile of migration, kvm_tlb_flush_vmid_range() is executed to flush TLBs. It passes MAX_TLBI_RANGE_PAGES as the argument to __kvm_tlb_flush_vmid_range() and __flush_s2_tlb_range_op(). SCALE#3 and NUM#31, corresponding to MAX_TLBI_RANGE_PAGES, isn't supported by __TLBI_RANGE_NUM(). In this specific case, -1 has been returned from __TLBI_RANGE_NUM() for SCALE#3/2/1/0 and rejected by the loop in the __flush_tlb_range_op() until the variable @scale underflows and becomes -9, 0xffff708000040000 is set as the operand. The operand is wrong since it's sorted out by __TLBI_VADDR_RANGE() according to invalid @scale and @num. Fix it by extending __TLBI_RANGE_NUM() to support the combination of SCALE#3 and NUM#31. With the changes, [-1 31] instead of [-1 30] can be returned from the macro, meaning the TLBs for 0x200000 pages in the above example can be flushed in one shoot with SCALE#3 and NUM#31. The macro TLBI_RANGE_MASK is dropped since no one uses it any more. The comments are also adjusted accordingly. Fixes: 117940aa6e5f ("KVM: arm64: Define kvm_tlb_flush_vmid_range()") Cc: stable@kernel.org # v6.6+ Reported-by: Yihuang Yu Suggested-by: Marc Zyngier Signed-off-by: Gavin Shan Reviewed-by: Catalin Marinas Reviewed-by: Ryan Roberts Reviewed-by: Anshuman Khandual Reviewed-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240405035852.1532010-2-gshan@redhat.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3b0e8248e1a4..a75de2665d84 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -161,12 +161,18 @@ static inline unsigned long get_trans_granule(void) #define MAX_TLBI_RANGE_PAGES __TLBI_RANGE_PAGES(31, 3) /* - * Generate 'num' values from -1 to 30 with -1 rejected by the - * __flush_tlb_range() loop below. + * Generate 'num' values from -1 to 31 with -1 rejected by the + * __flush_tlb_range() loop below. Its return value is only + * significant for a maximum of MAX_TLBI_RANGE_PAGES pages. If + * 'pages' is more than that, you must iterate over the overall + * range. */ -#define TLBI_RANGE_MASK GENMASK_ULL(4, 0) -#define __TLBI_RANGE_NUM(pages, scale) \ - ((((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1) +#define __TLBI_RANGE_NUM(pages, scale) \ + ({ \ + int __pages = min((pages), \ + __TLBI_RANGE_PAGES(31, (scale))); \ + (__pages >> (5 * (scale) + 1)) - 1; \ + }) /* * TLB Invalidation @@ -379,10 +385,6 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * 3. If there is 1 page remaining, flush it through non-range operations. Range * operations can only span an even number of pages. We save this for last to * ensure 64KB start alignment is maintained for the LPA2 case. - * - * Note that certain ranges can be represented by either num = 31 and - * scale or num = 0 and scale + 1. The loop below favours the latter - * since num is limited to 30 by the __TLBI_RANGE_NUM() macro. */ #define __flush_tlb_range_op(op, start, pages, stride, \ asid, tlb_level, tlbi_user, lpa2) \ From 5284984a4fbacb0883bfebe905902cdda2891a07 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Apr 2024 18:32:12 +0300 Subject: [PATCH 135/270] bug: Fix no-return-statement warning with !CONFIG_BUG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BUG() does not return, and arch implementations of BUG() use unreachable() or other non-returning code. However with !CONFIG_BUG, the default implementation is often used instead, and that does not do that. x86 always uses its own implementation, but powerpc with !CONFIG_BUG gives a build error: kernel/time/timekeeping.c: In function ‘timekeeping_debug_get_ns’: kernel/time/timekeeping.c:286:1: error: no return statement in function returning non-void [-Werror=return-type] Add unreachable() to default !CONFIG_BUG BUG() implementation. Fixes: e8e9d21a5df6 ("timekeeping: Refactor timekeeping helpers") Reported-by: Naresh Kamboju Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Tested-by: Linux Kernel Functional Testing Link: https://lore.kernel.org/r/20240410153212.127477-1-adrian.hunter@intel.com Closes: https://lore.kernel.org/all/CA+G9fYvjdZCW=7ZGxS6A_3bysjQ56YF7S-+PNLQ_8a4DKh1Bhg@mail.gmail.com/ --- include/asm-generic/bug.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 6e794420bd39..b7de3a4eade1 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -156,7 +156,10 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #else /* !CONFIG_BUG */ #ifndef HAVE_ARCH_BUG -#define BUG() do {} while (1) +#define BUG() do { \ + do {} while (1); \ + unreachable(); \ +} while (0) #endif #ifndef HAVE_ARCH_BUG_ON From 076361362122a6d8a4c45f172ced5576b2d4a50d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 9 Apr 2024 13:22:12 -0700 Subject: [PATCH 136/270] selftests: timers: Fix valid-adjtimex signed left-shift undefined behavior The struct adjtimex freq field takes a signed value who's units are in shifted (<<16) parts-per-million. Unfortunately for negative adjustments, the straightforward use of: freq = ppm << 16 trips undefined behavior warnings with clang: valid-adjtimex.c:66:6: warning: shifting a negative signed value is undefined [-Wshift-negative-value] -499<<16, ~~~~^ valid-adjtimex.c:67:6: warning: shifting a negative signed value is undefined [-Wshift-negative-value] -450<<16, ~~~~^ .. Fix it by using a multiply by (1 << 16) instead of shifting negative values in the valid-adjtimex test case. Align the values for better readability. Reported-by: Lee Jones Reported-by: Muhammad Usama Anjum Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240409202222.2830476-1-jstultz@google.com Link: https://lore.kernel.org/lkml/0c6d4f0d-2064-4444-986b-1d1ed782135f@collabora.com/ --- .../testing/selftests/timers/valid-adjtimex.c | 73 +++++++++---------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index 48b9a803235a..d13ebde20322 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -21,9 +21,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - - - #include #include #include @@ -62,45 +59,47 @@ int clear_time_state(void) #define NUM_FREQ_OUTOFRANGE 4 #define NUM_FREQ_INVALID 2 +#define SHIFTED_PPM (1 << 16) + long valid_freq[NUM_FREQ_VALID] = { - -499<<16, - -450<<16, - -400<<16, - -350<<16, - -300<<16, - -250<<16, - -200<<16, - -150<<16, - -100<<16, - -75<<16, - -50<<16, - -25<<16, - -10<<16, - -5<<16, - -1<<16, + -499 * SHIFTED_PPM, + -450 * SHIFTED_PPM, + -400 * SHIFTED_PPM, + -350 * SHIFTED_PPM, + -300 * SHIFTED_PPM, + -250 * SHIFTED_PPM, + -200 * SHIFTED_PPM, + -150 * SHIFTED_PPM, + -100 * SHIFTED_PPM, + -75 * SHIFTED_PPM, + -50 * SHIFTED_PPM, + -25 * SHIFTED_PPM, + -10 * SHIFTED_PPM, + -5 * SHIFTED_PPM, + -1 * SHIFTED_PPM, -1000, - 1<<16, - 5<<16, - 10<<16, - 25<<16, - 50<<16, - 75<<16, - 100<<16, - 150<<16, - 200<<16, - 250<<16, - 300<<16, - 350<<16, - 400<<16, - 450<<16, - 499<<16, + 1 * SHIFTED_PPM, + 5 * SHIFTED_PPM, + 10 * SHIFTED_PPM, + 25 * SHIFTED_PPM, + 50 * SHIFTED_PPM, + 75 * SHIFTED_PPM, + 100 * SHIFTED_PPM, + 150 * SHIFTED_PPM, + 200 * SHIFTED_PPM, + 250 * SHIFTED_PPM, + 300 * SHIFTED_PPM, + 350 * SHIFTED_PPM, + 400 * SHIFTED_PPM, + 450 * SHIFTED_PPM, + 499 * SHIFTED_PPM, }; long outofrange_freq[NUM_FREQ_OUTOFRANGE] = { - -1000<<16, - -550<<16, - 550<<16, - 1000<<16, + -1000 * SHIFTED_PPM, + -550 * SHIFTED_PPM, + 550 * SHIFTED_PPM, + 1000 * SHIFTED_PPM, }; #define LONG_MAX (~0UL>>1) From d9ea7a3f66a5c7e1a2f73cf4b20f5eff3ced4ff8 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 19 Mar 2024 11:43:50 +0800 Subject: [PATCH 137/270] hv: vmbus: Convert sprintf() family to sysfs_emit() family Per filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Coccinelle complains that there are still a couple of functions that use snprintf(). Convert them to sysfs_emit(). sprintf() and scnprintf() will be converted as well if these files have such abused cases. This patch is generated by make coccicheck M= MODE=patch \ COCCI=scripts/coccinelle/api/device_attr_show.cocci No functional change intended. CC: "K. Y. Srinivasan" CC: Haiyang Zhang CC: Wei Liu CC: Dexuan Cui CC: linux-hyperv@vger.kernel.org Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20240319034350.1574454-1-lizhijian@fujitsu.com Signed-off-by: Wei Liu Message-ID: <20240319034350.1574454-1-lizhijian@fujitsu.com> --- drivers/hv/vmbus_drv.c | 94 +++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 52 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 7f7965f3d187..121f1ab32b51 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -131,7 +131,7 @@ static ssize_t id_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.child_relid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.child_relid); } static DEVICE_ATTR_RO(id); @@ -142,7 +142,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->state); + return sysfs_emit(buf, "%d\n", hv_dev->channel->state); } static DEVICE_ATTR_RO(state); @@ -153,7 +153,7 @@ static ssize_t monitor_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.monitorid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.monitorid); } static DEVICE_ATTR_RO(monitor_id); @@ -164,8 +164,8 @@ static ssize_t class_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_type); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_type); } static DEVICE_ATTR_RO(class_id); @@ -176,8 +176,8 @@ static ssize_t device_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_instance); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_instance); } static DEVICE_ATTR_RO(device_id); @@ -186,7 +186,7 @@ static ssize_t modalias_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); + return sysfs_emit(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); } static DEVICE_ATTR_RO(modalias); @@ -199,7 +199,7 @@ static ssize_t numa_node_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); + return sysfs_emit(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); } static DEVICE_ATTR_RO(numa_node); #endif @@ -212,9 +212,8 @@ static ssize_t server_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_pending); @@ -226,9 +225,8 @@ static ssize_t client_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_pending); @@ -240,9 +238,8 @@ static ssize_t server_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_latency); @@ -254,9 +251,8 @@ static ssize_t client_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_latency); @@ -268,9 +264,8 @@ static ssize_t server_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_conn_id); @@ -282,9 +277,8 @@ static ssize_t client_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_conn_id); @@ -303,7 +297,7 @@ static ssize_t out_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", outbound.current_interrupt_mask); } static DEVICE_ATTR_RO(out_intr_mask); @@ -321,7 +315,7 @@ static ssize_t out_read_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_read_index); + return sysfs_emit(buf, "%d\n", outbound.current_read_index); } static DEVICE_ATTR_RO(out_read_index); @@ -340,7 +334,7 @@ static ssize_t out_write_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_write_index); + return sysfs_emit(buf, "%d\n", outbound.current_write_index); } static DEVICE_ATTR_RO(out_write_index); @@ -359,7 +353,7 @@ static ssize_t out_read_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_toread); } static DEVICE_ATTR_RO(out_read_bytes_avail); @@ -378,7 +372,7 @@ static ssize_t out_write_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(out_write_bytes_avail); @@ -396,7 +390,7 @@ static ssize_t in_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", inbound.current_interrupt_mask); } static DEVICE_ATTR_RO(in_intr_mask); @@ -414,7 +408,7 @@ static ssize_t in_read_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_read_index); + return sysfs_emit(buf, "%d\n", inbound.current_read_index); } static DEVICE_ATTR_RO(in_read_index); @@ -432,7 +426,7 @@ static ssize_t in_write_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_write_index); + return sysfs_emit(buf, "%d\n", inbound.current_write_index); } static DEVICE_ATTR_RO(in_write_index); @@ -451,7 +445,7 @@ static ssize_t in_read_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_toread); } static DEVICE_ATTR_RO(in_read_bytes_avail); @@ -470,7 +464,7 @@ static ssize_t in_write_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(in_write_bytes_avail); @@ -480,7 +474,7 @@ static ssize_t channel_vp_mapping_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); struct vmbus_channel *channel = hv_dev->channel, *cur_sc; - int buf_size = PAGE_SIZE, n_written, tot_written; + int n_written; struct list_head *cur; if (!channel) @@ -488,25 +482,21 @@ static ssize_t channel_vp_mapping_show(struct device *dev, mutex_lock(&vmbus_connection.channel_mutex); - tot_written = snprintf(buf, buf_size, "%u:%u\n", - channel->offermsg.child_relid, channel->target_cpu); + n_written = sysfs_emit(buf, "%u:%u\n", + channel->offermsg.child_relid, + channel->target_cpu); list_for_each(cur, &channel->sc_list) { - if (tot_written >= buf_size - 1) - break; cur_sc = list_entry(cur, struct vmbus_channel, sc_list); - n_written = scnprintf(buf + tot_written, - buf_size - tot_written, - "%u:%u\n", - cur_sc->offermsg.child_relid, - cur_sc->target_cpu); - tot_written += n_written; + n_written += sysfs_emit_at(buf, n_written, "%u:%u\n", + cur_sc->offermsg.child_relid, + cur_sc->target_cpu); } mutex_unlock(&vmbus_connection.channel_mutex); - return tot_written; + return n_written; } static DEVICE_ATTR_RO(channel_vp_mapping); @@ -516,7 +506,7 @@ static ssize_t vendor_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->vendor_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->vendor_id); } static DEVICE_ATTR_RO(vendor); @@ -526,7 +516,7 @@ static ssize_t device_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->device_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->device_id); } static DEVICE_ATTR_RO(device); @@ -551,7 +541,7 @@ static ssize_t driver_override_show(struct device *dev, ssize_t len; device_lock(dev); - len = snprintf(buf, PAGE_SIZE, "%s\n", hv_dev->driver_override); + len = sysfs_emit(buf, "%s\n", hv_dev->driver_override); device_unlock(dev); return len; From f971f6dd3742d22dd13710306fb4365ea7bcb536 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Fri, 22 Mar 2024 06:46:02 -0700 Subject: [PATCH 138/270] hv/hv_kvp_daemon: Handle IPv4 and Ipv6 combination for keyfile format If the network configuration strings are passed as a combination of IPv4 and IPv6 addresses, the current KVP daemon does not handle processing for the keyfile configuration format. With these changes, the keyfile config generation logic scans through the list twice to generate IPv4 and IPv6 sections for the configuration files to handle this support. Testcases ran:Rhel 9, Hyper-V VMs (IPv4 only, IPv6 only, IPv4 and IPv6 combination) Co-developed-by: Ani Sinha Signed-off-by: Ani Sinha Signed-off-by: Shradha Gupta Reviewed-by: Easwar Hariharan Tested-by: Ani Sinha Reviewed-by: Ani Sinha Link: https://lore.kernel.org/r/1711115162-11629-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711115162-11629-1-git-send-email-shradhagupta@linux.microsoft.com> --- tools/hv/hv_kvp_daemon.c | 215 +++++++++++++++++++++++++++++++-------- 1 file changed, 173 insertions(+), 42 deletions(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 318e2dad27e0..ae57bf69ad4a 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -76,6 +76,12 @@ enum { DNS }; +enum { + IPV4 = 1, + IPV6, + IP_TYPE_MAX +}; + static int in_hand_shake; static char *os_name = ""; @@ -102,6 +108,11 @@ static struct utsname uts_buf; #define MAX_FILE_NAME 100 #define ENTRIES_PER_BLOCK 50 +/* + * Change this entry if the number of addresses increases in future + */ +#define MAX_IP_ENTRIES 64 +#define OUTSTR_BUF_SIZE ((INET6_ADDRSTRLEN + 1) * MAX_IP_ENTRIES) struct kvp_record { char key[HV_KVP_EXCHANGE_MAX_KEY_SIZE]; @@ -1171,6 +1182,18 @@ static int process_ip_string(FILE *f, char *ip_string, int type) return 0; } +int ip_version_check(const char *input_addr) +{ + struct in6_addr addr; + + if (inet_pton(AF_INET, input_addr, &addr)) + return IPV4; + else if (inet_pton(AF_INET6, input_addr, &addr)) + return IPV6; + + return -EINVAL; +} + /* * Only IPv4 subnet strings needs to be converted to plen * For IPv6 the subnet is already privided in plen format @@ -1197,14 +1220,75 @@ static int kvp_subnet_to_plen(char *subnet_addr_str) return plen; } +static int process_dns_gateway_nm(FILE *f, char *ip_string, int type, + int ip_sec) +{ + char addr[INET6_ADDRSTRLEN], *output_str; + int ip_offset = 0, error = 0, ip_ver; + char *param_name; + + if (type == DNS) + param_name = "dns"; + else if (type == GATEWAY) + param_name = "gateway"; + else + return -EINVAL; + + output_str = (char *)calloc(OUTSTR_BUF_SIZE, sizeof(char)); + if (!output_str) + return -ENOMEM; + + while (1) { + memset(addr, 0, sizeof(addr)); + + if (!parse_ip_val_buffer(ip_string, &ip_offset, addr, + (MAX_IP_ADDR_SIZE * 2))) + break; + + ip_ver = ip_version_check(addr); + if (ip_ver < 0) + continue; + + if ((ip_ver == IPV4 && ip_sec == IPV4) || + (ip_ver == IPV6 && ip_sec == IPV6)) { + /* + * do a bound check to avoid out-of bound writes + */ + if ((OUTSTR_BUF_SIZE - strlen(output_str)) > + (strlen(addr) + 1)) { + strncat(output_str, addr, + OUTSTR_BUF_SIZE - + strlen(output_str) - 1); + strncat(output_str, ",", + OUTSTR_BUF_SIZE - + strlen(output_str) - 1); + } + } else { + continue; + } + } + + if (strlen(output_str)) { + /* + * This is to get rid of that extra comma character + * in the end of the string + */ + output_str[strlen(output_str) - 1] = '\0'; + error = fprintf(f, "%s=%s\n", param_name, output_str); + } + + free(output_str); + return error; +} + static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, - int is_ipv6) + int ip_sec) { char addr[INET6_ADDRSTRLEN]; char subnet_addr[INET6_ADDRSTRLEN]; - int error, i = 0; + int error = 0, i = 0; int ip_offset = 0, subnet_offset = 0; - int plen; + int plen, ip_ver; memset(addr, 0, sizeof(addr)); memset(subnet_addr, 0, sizeof(subnet_addr)); @@ -1216,10 +1300,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, subnet_addr, (MAX_IP_ADDR_SIZE * 2))) { - if (!is_ipv6) + ip_ver = ip_version_check(addr); + if (ip_ver < 0) + continue; + + if (ip_ver == IPV4 && ip_sec == IPV4) plen = kvp_subnet_to_plen((char *)subnet_addr); - else + else if (ip_ver == IPV6 && ip_sec == IPV6) plen = atoi(subnet_addr); + else + continue; if (plen < 0) return plen; @@ -1233,17 +1323,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, memset(subnet_addr, 0, sizeof(subnet_addr)); } - return 0; + return error; } static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) { - int error = 0; + int error = 0, ip_ver; char if_filename[PATH_MAX]; char nm_filename[PATH_MAX]; FILE *ifcfg_file, *nmfile; char cmd[PATH_MAX]; - int is_ipv6 = 0; char *mac_addr; int str_len; @@ -1421,52 +1510,94 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) if (error) goto setval_error; - if (new_val->addr_family & ADDR_FAMILY_IPV6) { - error = fprintf(nmfile, "\n[ipv6]\n"); - if (error < 0) - goto setval_error; - is_ipv6 = 1; - } else { - error = fprintf(nmfile, "\n[ipv4]\n"); - if (error < 0) - goto setval_error; - } - /* * Now we populate the keyfile format + * + * The keyfile format expects the IPv6 and IPv4 configuration in + * different sections. Therefore we iterate through the list twice, + * once to populate the IPv4 section and the next time for IPv6 */ + ip_ver = IPV4; + do { + if (ip_ver == IPV4) { + error = fprintf(nmfile, "\n[ipv4]\n"); + if (error < 0) + goto setval_error; + } else { + error = fprintf(nmfile, "\n[ipv6]\n"); + if (error < 0) + goto setval_error; + } - if (new_val->dhcp_enabled) { - error = kvp_write_file(nmfile, "method", "", "auto"); + /* + * Write the configuration for ipaddress, netmask, gateway and + * name services + */ + error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr, + (char *)new_val->sub_net, + ip_ver); if (error < 0) goto setval_error; - } else { - error = kvp_write_file(nmfile, "method", "", "manual"); + + /* + * As dhcp_enabled is only valid for ipv4, we do not set dhcp + * methods for ipv6 based on dhcp_enabled flag. + * + * For ipv4, set method to manual only when dhcp_enabled is + * false and specific ipv4 addresses are configured. If neither + * dhcp_enabled is true and no ipv4 addresses are configured, + * set method to 'disabled'. + * + * For ipv6, set method to manual when we configure ipv6 + * addresses. Otherwise set method to 'auto' so that SLAAC from + * RA may be used. + */ + if (ip_ver == IPV4) { + if (new_val->dhcp_enabled) { + error = kvp_write_file(nmfile, "method", "", + "auto"); + if (error < 0) + goto setval_error; + } else if (error) { + error = kvp_write_file(nmfile, "method", "", + "manual"); + if (error < 0) + goto setval_error; + } else { + error = kvp_write_file(nmfile, "method", "", + "disabled"); + if (error < 0) + goto setval_error; + } + } else if (ip_ver == IPV6) { + if (error) { + error = kvp_write_file(nmfile, "method", "", + "manual"); + if (error < 0) + goto setval_error; + } else { + error = kvp_write_file(nmfile, "method", "", + "auto"); + if (error < 0) + goto setval_error; + } + } + + error = process_dns_gateway_nm(nmfile, + (char *)new_val->gate_way, + GATEWAY, ip_ver); if (error < 0) goto setval_error; - } - /* - * Write the configuration for ipaddress, netmask, gateway and - * name services - */ - error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr, - (char *)new_val->sub_net, is_ipv6); - if (error < 0) - goto setval_error; - - /* we do not want ipv4 addresses in ipv6 section and vice versa */ - if (is_ipv6 != is_ipv4((char *)new_val->gate_way)) { - error = fprintf(nmfile, "gateway=%s\n", (char *)new_val->gate_way); + error = process_dns_gateway_nm(nmfile, + (char *)new_val->dns_addr, DNS, + ip_ver); if (error < 0) goto setval_error; - } - if (is_ipv6 != is_ipv4((char *)new_val->dns_addr)) { - error = fprintf(nmfile, "dns=%s\n", (char *)new_val->dns_addr); - if (error < 0) - goto setval_error; - } + ip_ver++; + } while (ip_ver < IP_TYPE_MAX); + fclose(nmfile); fclose(ifcfg_file); From 03f5a999adba062456c8c818a683beb1b498983a Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:54 -0700 Subject: [PATCH 139/270] Drivers: hv: vmbus: Leak pages if set_memory_encrypted() fails In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. VMBus code could free decrypted pages if set_memory_encrypted()/decrypted() fails. Leak the pages if this happens. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-2-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-2-mhklinux@outlook.com> --- drivers/hv/connection.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 3cabeeabb1ca..f001ae880e1d 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -237,8 +237,17 @@ int vmbus_connect(void) vmbus_connection.monitor_pages[0], 1); ret |= set_memory_decrypted((unsigned long) vmbus_connection.monitor_pages[1], 1); - if (ret) + if (ret) { + /* + * If set_memory_decrypted() fails, the encryption state + * of the memory is unknown. So leak the memory instead + * of risking returning decrypted memory to the free list. + * For simplicity, always handle both pages the same. + */ + vmbus_connection.monitor_pages[0] = NULL; + vmbus_connection.monitor_pages[1] = NULL; goto cleanup; + } /* * Set_memory_decrypted() will change the memory contents if @@ -337,13 +346,19 @@ void vmbus_disconnect(void) vmbus_connection.int_page = NULL; } - set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[0], 1); - set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[1], 1); + if (vmbus_connection.monitor_pages[0]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[0], 1)) + hv_free_hyperv_page(vmbus_connection.monitor_pages[0]); + vmbus_connection.monitor_pages[0] = NULL; + } - hv_free_hyperv_page(vmbus_connection.monitor_pages[0]); - hv_free_hyperv_page(vmbus_connection.monitor_pages[1]); - vmbus_connection.monitor_pages[0] = NULL; - vmbus_connection.monitor_pages[1] = NULL; + if (vmbus_connection.monitor_pages[1]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[1], 1)) + hv_free_hyperv_page(vmbus_connection.monitor_pages[1]); + vmbus_connection.monitor_pages[1] = NULL; + } } /* From 211f514ebf1ef5de37b1cf6df9d28a56cfd242ca Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:55 -0700 Subject: [PATCH 140/270] Drivers: hv: vmbus: Track decrypted status in vmbus_gpadl In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. In order to make sure callers of vmbus_establish_gpadl() and vmbus_teardown_gpadl() don't return decrypted/shared pages to allocators, add a field in struct vmbus_gpadl to keep track of the decryption status of the buffers. This will allow the callers to know if they should free or leak the pages. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-3-mhklinux@outlook.com> --- drivers/hv/channel.c | 25 +++++++++++++++++++++---- include/linux/hyperv.h | 1 + 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index adbf674355b2..98259b492502 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -436,9 +436,18 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, (atomic_inc_return(&vmbus_connection.next_gpadl_handle) - 1); ret = create_gpadl_header(type, kbuffer, size, send_offset, &msginfo); - if (ret) + if (ret) { + gpadl->decrypted = false; return ret; + } + /* + * Set the "decrypted" flag to true for the set_memory_decrypted() + * success case. In the failure case, the encryption state of the + * memory is unknown. Leave "decrypted" as true to ensure the + * memory will be leaked instead of going back on the free list. + */ + gpadl->decrypted = true; ret = set_memory_decrypted((unsigned long)kbuffer, PFN_UP(size)); if (ret) { @@ -527,9 +536,15 @@ cleanup: kfree(msginfo); - if (ret) - set_memory_encrypted((unsigned long)kbuffer, - PFN_UP(size)); + if (ret) { + /* + * If set_memory_encrypted() fails, the decrypted flag is + * left as true so the memory is leaked instead of being + * put back on the free list. + */ + if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) + gpadl->decrypted = false; + } return ret; } @@ -850,6 +865,8 @@ post_msg_err: if (ret) pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret); + gpadl->decrypted = ret; + return ret; } EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 6ef0557b4bff..96ceb4095425 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -832,6 +832,7 @@ struct vmbus_gpadl { u32 gpadl_handle; u32 size; void *buffer; + bool decrypted; }; struct vmbus_channel { From bbf9ac34677b57506a13682b31a2a718934c0e31 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:56 -0700 Subject: [PATCH 141/270] hv_netvsc: Don't free decrypted memory In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. The netvsc driver could free decrypted/shared pages if set_memory_decrypted() fails. Check the decrypted field in the gpadl to decide whether to free the memory. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-4-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-4-mhklinux@outlook.com> --- drivers/net/hyperv/netvsc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index a6fcbda64ecc..2b6ec979a62f 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -154,8 +154,11 @@ static void free_netvsc_device(struct rcu_head *head) int i; kfree(nvdev->extension); - vfree(nvdev->recv_buf); - vfree(nvdev->send_buf); + + if (!nvdev->recv_buf_gpadl_handle.decrypted) + vfree(nvdev->recv_buf); + if (!nvdev->send_buf_gpadl_handle.decrypted) + vfree(nvdev->send_buf); bitmap_free(nvdev->send_section_map); for (i = 0; i < VRSS_CHANNEL_MAX; i++) { From 3d788b2fbe6a1a1a9e3db09742b90809d51638b7 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:57 -0700 Subject: [PATCH 142/270] uio_hv_generic: Don't free decrypted memory In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. The VMBus device UIO driver could free decrypted/shared pages if set_memory_decrypted() fails. Check the decrypted field in the gpadl to decide whether to free the memory. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-5-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-5-mhklinux@outlook.com> --- drivers/uio/uio_hv_generic.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/uio/uio_hv_generic.c b/drivers/uio/uio_hv_generic.c index 20d9762331bd..6be3462b109f 100644 --- a/drivers/uio/uio_hv_generic.c +++ b/drivers/uio/uio_hv_generic.c @@ -181,12 +181,14 @@ hv_uio_cleanup(struct hv_device *dev, struct hv_uio_private_data *pdata) { if (pdata->send_gpadl.gpadl_handle) { vmbus_teardown_gpadl(dev->channel, &pdata->send_gpadl); - vfree(pdata->send_buf); + if (!pdata->send_gpadl.decrypted) + vfree(pdata->send_buf); } if (pdata->recv_gpadl.gpadl_handle) { vmbus_teardown_gpadl(dev->channel, &pdata->recv_gpadl); - vfree(pdata->recv_buf); + if (!pdata->recv_gpadl.decrypted) + vfree(pdata->recv_buf); } } @@ -295,7 +297,8 @@ hv_uio_probe(struct hv_device *dev, ret = vmbus_establish_gpadl(channel, pdata->recv_buf, RECV_BUFFER_SIZE, &pdata->recv_gpadl); if (ret) { - vfree(pdata->recv_buf); + if (!pdata->recv_gpadl.decrypted) + vfree(pdata->recv_buf); goto fail_close; } @@ -317,7 +320,8 @@ hv_uio_probe(struct hv_device *dev, ret = vmbus_establish_gpadl(channel, pdata->send_buf, SEND_BUFFER_SIZE, &pdata->send_gpadl); if (ret) { - vfree(pdata->send_buf); + if (!pdata->send_gpadl.decrypted) + vfree(pdata->send_buf); goto fail_close; } From 30d18df6567be09c1433e81993e35e3da573ac48 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 11 Mar 2024 09:15:58 -0700 Subject: [PATCH 143/270] Drivers: hv: vmbus: Don't free ring buffers that couldn't be re-encrypted In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. The VMBus ring buffer code could free decrypted/shared pages if set_memory_decrypted() fails. Check the decrypted field in the struct vmbus_gpadl for the ring buffers to decide whether to free the memory. Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-6-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-6-mhklinux@outlook.com> --- drivers/hv/channel.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 98259b492502..fb8cd8469328 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -153,7 +153,9 @@ void vmbus_free_ring(struct vmbus_channel *channel) hv_ringbuffer_cleanup(&channel->inbound); if (channel->ringbuffer_page) { - __free_pages(channel->ringbuffer_page, + /* In a CoCo VM leak the memory if it didn't get re-encrypted */ + if (!channel->ringbuffer_gpadlhandle.decrypted) + __free_pages(channel->ringbuffer_page, get_order(channel->ringbuffer_pagecount << PAGE_SHIFT)); channel->ringbuffer_page = NULL; From a4833e3abae132d613ce7da0e0c9a9465d1681fa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 10 Apr 2024 12:38:13 -0400 Subject: [PATCH 144/270] SUNRPC: Fix rpcgss_context trace event acceptor field The rpcgss_context trace event acceptor field is a dynamically sized string that records the "data" parameter. But this parameter is also dependent on the "len" field to determine the size of the data. It needs to use __string_len() helper macro where the length can be passed in. It also incorrectly uses strncpy() to save it instead of __assign_str(). As these macros can change, it is not wise to open code them in trace events. As of commit c759e609030c ("tracing: Remove __assign_str_len()"), __assign_str() can be used for both __string() and __string_len() fields. Before that commit, __assign_str_len() is required to be used. This needs to be noted for backporting. (In actuality, commit c1fa617caeb0 ("tracing: Rework __assign_str() and __string() to not duplicate getting the string") is the commit that makes __string_str_len() obsolete). Cc: stable@vger.kernel.org Fixes: 0c77668ddb4e ("SUNRPC: Introduce trace points in rpc_auth_gss.ko") Signed-off-by: Steven Rostedt (Google) Signed-off-by: Chuck Lever --- include/trace/events/rpcgss.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index ba2d96a1bc2f..f50fcafc69de 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -609,7 +609,7 @@ TRACE_EVENT(rpcgss_context, __field(unsigned int, timeout) __field(u32, window_size) __field(int, len) - __string(acceptor, data) + __string_len(acceptor, data, len) ), TP_fast_assign( @@ -618,7 +618,7 @@ TRACE_EVENT(rpcgss_context, __entry->timeout = timeout; __entry->window_size = window_size; __entry->len = len; - strncpy(__get_str(acceptor), data, len); + __assign_str(acceptor, data); ), TP_printk("win_size=%u expiry=%lu now=%lu timeout=%u acceptor=%.*s", From ec4535b2a1d709d3a1fbec26739c672f13c98a7b Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 8 Apr 2024 18:32:17 -0300 Subject: [PATCH 145/270] smb: client: fix NULL ptr deref in cifs_mark_open_handles_for_deleted_file() cifs_get_fattr() may be called with a NULL inode, so check for a non-NULL inode before calling cifs_mark_open_handles_for_deleted_file(). This fixes the following oops: mount.cifs //srv/share /mnt -o ...,vers=3.1.1 cd /mnt touch foo; tail -f foo & rm foo cat foo BUG: kernel NULL pointer dereference, address: 00000000000005c0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 2 PID: 696 Comm: cat Not tainted 6.9.0-rc2 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39 04/01/2014 RIP: 0010:__lock_acquire+0x5d/0x1c70 Code: 00 00 44 8b a4 24 a0 00 00 00 45 85 f6 0f 84 bb 06 00 00 8b 2d 48 e2 95 01 45 89 c3 41 89 d2 45 89 c8 85 ed 0 0 <48> 81 3f 40 7a 76 83 44 0f 44 d8 83 fe 01 0f 86 1b 03 00 00 31 d2 RSP: 0018:ffffc90000b37490 EFLAGS: 00010002 RAX: 0000000000000000 RBX: ffff888110021ec0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000000005c0 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000200 FS: 00007f2a1fa08740(0000) GS:ffff888157a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000005c0 CR3: 000000011ac7c000 CR4: 0000000000750ef0 PKRU: 55555554 Call Trace: ? __die+0x23/0x70 ? page_fault_oops+0x180/0x490 ? srso_alias_return_thunk+0x5/0xfbef5 ? exc_page_fault+0x70/0x230 ? asm_exc_page_fault+0x26/0x30 ? __lock_acquire+0x5d/0x1c70 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 lock_acquire+0xc0/0x2d0 ? cifs_mark_open_handles_for_deleted_file+0x3a/0x100 [cifs] ? srso_alias_return_thunk+0x5/0xfbef5 ? kmem_cache_alloc+0x2d9/0x370 _raw_spin_lock+0x34/0x80 ? cifs_mark_open_handles_for_deleted_file+0x3a/0x100 [cifs] cifs_mark_open_handles_for_deleted_file+0x3a/0x100 [cifs] cifs_get_fattr+0x24c/0x940 [cifs] ? srso_alias_return_thunk+0x5/0xfbef5 cifs_get_inode_info+0x96/0x120 [cifs] cifs_lookup+0x16e/0x800 [cifs] cifs_atomic_open+0xc7/0x5d0 [cifs] ? lookup_open.isra.0+0x3ce/0x5f0 ? __pfx_cifs_atomic_open+0x10/0x10 [cifs] lookup_open.isra.0+0x3ce/0x5f0 path_openat+0x42b/0xc30 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 do_filp_open+0xc4/0x170 do_sys_openat2+0xab/0xe0 __x64_sys_openat+0x57/0xa0 do_syscall_64+0xc1/0x1e0 entry_SYSCALL_64_after_hwframe+0x72/0x7a Fixes: ffceb7640cbf ("smb: client: do not defer close open handles to deleted files") Reviewed-by: Meetakshi Setiya Reviewed-by: Bharath SM Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 91b07ef9e25c..60afab5c83d4 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1105,7 +1105,8 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, } else { cifs_open_info_to_fattr(fattr, data, sb); } - if (!rc && fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) + if (!rc && *inode && + (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING)) cifs_mark_open_handles_for_deleted_file(*inode, full_path); break; case -EREMOTE: From dfe648903f42296866d79f10d03f8c85c9dfba30 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:45 -0700 Subject: [PATCH 146/270] x86/bugs: Fix BHI documentation Fix up some inaccuracies in the BHI documentation. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/8c84f7451bfe0dd08543c6082a383f390d4aa7e2.1712813475.git.jpoimboe@kernel.org --- Documentation/admin-guide/hw-vuln/spectre.rst | 15 ++++++++------- Documentation/admin-guide/kernel-parameters.txt | 12 +++++++----- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index b70b1d8bd8e6..3cf18e4a1d9a 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -439,11 +439,11 @@ The possible values in this file are: - System is protected by retpoline * - BHI: BHI_DIS_S - System is protected by BHI_DIS_S - * - BHI: SW loop; KVM SW loop + * - BHI: SW loop, KVM SW loop - System is protected by software clearing sequence * - BHI: Syscall hardening - Syscalls are hardened against BHI - * - BHI: Syscall hardening; KVM: SW loop + * - BHI: Syscall hardening, KVM: SW loop - System is protected from userspace attacks by syscall hardening; KVM is protected by software clearing sequence Full mitigation might require a microcode update from the CPU @@ -666,13 +666,14 @@ kernel command line. of the HW BHI control and the SW BHB clearing sequence. on - unconditionally enable. + (default) Enable the HW or SW mitigation as + needed. off - unconditionally disable. + Disable the mitigation. auto - enable if hardware mitigation - control(BHI_DIS_S) is available, otherwise - enable alternate mitigation in KVM. + Enable the HW mitigation if needed, but + *don't* enable the SW mitigation except for KVM. + The system may be vulnerable. For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 70046a019d42..a029ad6c4963 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3444,6 +3444,7 @@ retbleed=off [X86] spec_rstack_overflow=off [X86] spec_store_bypass_disable=off [X86,PPC] + spectre_bhi=off [X86] spectre_v2_user=off [X86] srbds=off [X86,INTEL] ssbd=force-off [ARM64] @@ -6069,11 +6070,12 @@ deployment of the HW BHI control and the SW BHB clearing sequence. - on - unconditionally enable. - off - unconditionally disable. - auto - (default) enable hardware mitigation - (BHI_DIS_S) if available, otherwise enable - alternate mitigation in KVM. + on - (default) Enable the HW or SW mitigation + as needed. + off - Disable the mitigation. + auto - Enable the HW mitigation if needed, but + *don't* enable the SW mitigation except + for KVM. The system may be vulnerable. spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. From cb2db5bb04d7f778fbc1a1ea2507aab436f1bff3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:46 -0700 Subject: [PATCH 147/270] x86/bugs: Cache the value of MSR_IA32_ARCH_CAPABILITIES There's no need to keep reading MSR_IA32_ARCH_CAPABILITIES over and over. It's even read in the BHI sysfs function which is a big no-no. Just read it once and cache it. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/9592a18a814368e75f8f4b9d74d3883aa4fd1eaf.1712813475.git.jpoimboe@kernel.org --- arch/x86/kernel/cpu/bugs.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 27f5004a8e24..ff59fa8bb610 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -61,6 +61,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); +static u64 __ro_after_init ia32_cap; + static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; @@ -144,6 +146,8 @@ void __init cpu_select_mitigations(void) x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; } + ia32_cap = x86_read_arch_cap_msr(); + /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); @@ -301,8 +305,6 @@ static const char * const taa_strings[] = { static void __init taa_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_TAA)) { taa_mitigation = TAA_MITIGATION_OFF; return; @@ -341,7 +343,6 @@ static void __init taa_select_mitigation(void) * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode * update is required. */ - ia32_cap = x86_read_arch_cap_msr(); if ( (ia32_cap & ARCH_CAP_MDS_NO) && !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; @@ -401,8 +402,6 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || cpu_mitigations_off()) { @@ -413,8 +412,6 @@ static void __init mmio_select_mitigation(void) if (mmio_mitigation == MMIO_MITIGATION_OFF) return; - ia32_cap = x86_read_arch_cap_msr(); - /* * Enable CPU buffer clear mitigation for host and VMM, if also affected * by MDS or TAA. Otherwise, enable mitigation for VMM only. @@ -508,7 +505,7 @@ static void __init rfds_select_mitigation(void) if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (x86_read_arch_cap_msr() & ARCH_CAP_RFDS_CLEAR) + if (ia32_cap & ARCH_CAP_RFDS_CLEAR) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; @@ -659,8 +656,6 @@ void update_srbds_msr(void) static void __init srbds_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_SRBDS)) return; @@ -669,7 +664,6 @@ static void __init srbds_select_mitigation(void) * are only exposed to SRBDS when TSX is enabled or when CPU is affected * by Processor MMIO Stale Data vulnerability. */ - ia32_cap = x86_read_arch_cap_msr(); if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; @@ -813,7 +807,7 @@ static void __init gds_select_mitigation(void) /* Will verify below that mitigation _can_ be disabled */ /* No microcode */ - if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { + if (!(ia32_cap & ARCH_CAP_GDS_CTRL)) { if (gds_mitigation == GDS_MITIGATION_FORCE) { /* * This only needs to be done on the boot CPU so do it @@ -1908,8 +1902,6 @@ static void update_indir_branch_cond(void) /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { - u64 ia32_cap = x86_read_arch_cap_msr(); - /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. @@ -2818,7 +2810,7 @@ static const char *spectre_bhi_state(void) else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && - !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) + !(ia32_cap & ARCH_CAP_RRSBA)) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Syscall hardening, KVM: SW loop"; From d0485730d2189ffe5d986d4e9e191f1e4d5ffd24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Apr 2024 09:25:36 +0200 Subject: [PATCH 148/270] x86/bugs: Rename various 'ia32_cap' variables to 'x86_arch_cap_msr' So we are using the 'ia32_cap' value in a number of places, which got its name from MSR_IA32_ARCH_CAPABILITIES MSR register. But there's very little 'IA32' about it - this isn't 32-bit only code, nor does it originate from there, it's just a historic quirk that many Intel MSR names are prefixed with IA32_. This is already clear from the helper method around the MSR: x86_read_arch_cap_msr(), which doesn't have the IA32 prefix. So rename 'ia32_cap' to 'x86_arch_cap_msr' to be consistent with its role and with the naming of the helper function. Signed-off-by: Ingo Molnar Cc: Josh Poimboeuf Cc: Nikolay Borisov Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/9592a18a814368e75f8f4b9d74d3883aa4fd1eaf.1712813475.git.jpoimboe@kernel.org --- arch/x86/kernel/apic/apic.c | 6 ++--- arch/x86/kernel/cpu/bugs.c | 30 +++++++++++----------- arch/x86/kernel/cpu/common.c | 48 ++++++++++++++++++------------------ 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a42d8a6f7149..c342c4aa9c68 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1687,11 +1687,11 @@ static int x2apic_state; static bool x2apic_hw_locked(void) { - u64 ia32_cap; + u64 x86_arch_cap_msr; u64 msr; - ia32_cap = x86_read_arch_cap_msr(); - if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { + x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) { rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); return (msr & LEGACY_XAPIC_DISABLED); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ff59fa8bb610..1b0cfc136432 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -61,7 +61,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); -static u64 __ro_after_init ia32_cap; +static u64 __ro_after_init x86_arch_cap_msr; static DEFINE_MUTEX(spec_ctrl_mutex); @@ -146,7 +146,7 @@ void __init cpu_select_mitigations(void) x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; } - ia32_cap = x86_read_arch_cap_msr(); + x86_arch_cap_msr = x86_read_arch_cap_msr(); /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); @@ -343,8 +343,8 @@ static void __init taa_select_mitigation(void) * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode * update is required. */ - if ( (ia32_cap & ARCH_CAP_MDS_NO) && - !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) && + !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; /* @@ -434,7 +434,7 @@ static void __init mmio_select_mitigation(void) * be propagated to uncore buffers, clearing the Fill buffers on idle * is required irrespective of SMT state. */ - if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); /* @@ -444,10 +444,10 @@ static void __init mmio_select_mitigation(void) * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS * affected systems. */ - if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || (boot_cpu_has(X86_FEATURE_MD_CLEAR) && boot_cpu_has(X86_FEATURE_FLUSH_L1D) && - !(ia32_cap & ARCH_CAP_MDS_NO))) + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))) mmio_mitigation = MMIO_MITIGATION_VERW; else mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; @@ -505,7 +505,7 @@ static void __init rfds_select_mitigation(void) if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (ia32_cap & ARCH_CAP_RFDS_CLEAR) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; @@ -664,7 +664,7 @@ static void __init srbds_select_mitigation(void) * are only exposed to SRBDS when TSX is enabled or when CPU is affected * by Processor MMIO Stale Data vulnerability. */ - if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) @@ -807,7 +807,7 @@ static void __init gds_select_mitigation(void) /* Will verify below that mitigation _can_ be disabled */ /* No microcode */ - if (!(ia32_cap & ARCH_CAP_GDS_CTRL)) { + if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { if (gds_mitigation == GDS_MITIGATION_FORCE) { /* * This only needs to be done on the boot CPU so do it @@ -1541,14 +1541,14 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) /* Disable in-kernel use of non-RSB RET predictors */ static void __init spec_ctrl_disable_kernel_rrsba(void) { - u64 ia32_cap; + u64 x86_arch_cap_msr; if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) return; - ia32_cap = x86_read_arch_cap_msr(); + x86_arch_cap_msr = x86_read_arch_cap_msr(); - if (ia32_cap & ARCH_CAP_RRSBA) { + if (x86_arch_cap_msr & ARCH_CAP_RRSBA) { x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; update_spec_ctrl(x86_spec_ctrl_base); } @@ -1916,7 +1916,7 @@ static void update_mds_branch_idle(void) if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); } else if (mmio_mitigation == MMIO_MITIGATION_OFF || - (ia32_cap & ARCH_CAP_FBSDP_NO)) { + (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); } } @@ -2810,7 +2810,7 @@ static const char *spectre_bhi_state(void) else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && - !(ia32_cap & ARCH_CAP_RRSBA)) + !(x86_arch_cap_msr & ARCH_CAP_RRSBA)) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Syscall hardening, KVM: SW loop"; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 754d91857d63..605c26c009c8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1284,25 +1284,25 @@ static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long whi u64 x86_read_arch_cap_msr(void) { - u64 ia32_cap = 0; + u64 x86_arch_cap_msr = 0; if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); - return ia32_cap; + return x86_arch_cap_msr; } -static bool arch_cap_mmio_immune(u64 ia32_cap) +static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr) { - return (ia32_cap & ARCH_CAP_FBSDP_NO && - ia32_cap & ARCH_CAP_PSDP_NO && - ia32_cap & ARCH_CAP_SBDR_SSDP_NO); + return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO && + x86_arch_cap_msr & ARCH_CAP_PSDP_NO && + x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO); } -static bool __init vulnerable_to_rfds(u64 ia32_cap) +static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) { /* The "immunity" bit trumps everything else: */ - if (ia32_cap & ARCH_CAP_RFDS_NO) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO) return false; /* @@ -1310,7 +1310,7 @@ static bool __init vulnerable_to_rfds(u64 ia32_cap) * indicate that mitigation is needed because guest is running on a * vulnerable hardware or may migrate to such hardware: */ - if (ia32_cap & ARCH_CAP_RFDS_CLEAR) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) return true; /* Only consult the blacklist when there is no enumeration: */ @@ -1319,11 +1319,11 @@ static bool __init vulnerable_to_rfds(u64 ia32_cap) static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { - u64 ia32_cap = x86_read_arch_cap_msr(); + u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && - !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) @@ -1335,7 +1335,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && - !(ia32_cap & ARCH_CAP_SSB_NO) && + !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); @@ -1346,17 +1346,17 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Don't use AutoIBRS when SNP is enabled because it degrades host * userspace indirect branch performance. */ - if ((ia32_cap & ARCH_CAP_IBRS_ALL) || + if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) || (cpu_has(c, X86_FEATURE_AUTOIBRS) && !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO)) setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); } if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && - !(ia32_cap & ARCH_CAP_MDS_NO)) { + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); @@ -1375,9 +1375,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * TSX_CTRL check alone is not sufficient for cases when the microcode * update is not present or running as guest that don't get TSX_CTRL. */ - if (!(ia32_cap & ARCH_CAP_TAA_NO) && + if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) && (cpu_has(c, X86_FEATURE_RTM) || - (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); /* @@ -1403,7 +1403,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. */ - if (!arch_cap_mmio_immune(ia32_cap)) { + if (!arch_cap_mmio_immune(x86_arch_cap_msr)) { if (cpu_matches(cpu_vuln_blacklist, MMIO)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) @@ -1411,7 +1411,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { - if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA)) setup_force_cpu_bug(X86_BUG_RETBLEED); } @@ -1429,15 +1429,15 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * disabling AVX2. The only way to do this in HW is to clear XCR0[2], * which means that AVX will be disabled. */ - if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) && boot_cpu_has(X86_FEATURE_AVX)) setup_force_cpu_bug(X86_BUG_GDS); - if (vulnerable_to_rfds(ia32_cap)) + if (vulnerable_to_rfds(x86_arch_cap_msr)) setup_force_cpu_bug(X86_BUG_RFDS); /* When virtualized, eIBRS could be hidden, assume vulnerable */ - if (!(ia32_cap & ARCH_CAP_BHI_NO) && + if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) && !cpu_matches(cpu_vuln_whitelist, NO_BHI) && (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || boot_cpu_has(X86_FEATURE_HYPERVISOR))) @@ -1447,7 +1447,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) return; /* Rogue Data Cache Load? No! */ - if (ia32_cap & ARCH_CAP_RDCL_NO) + if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO) return; setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); From 1cea8a280dfd1016148a3820676f2f03e3f5b898 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:47 -0700 Subject: [PATCH 149/270] x86/bugs: Fix BHI handling of RRSBA The ARCH_CAP_RRSBA check isn't correct: RRSBA may have already been disabled by the Spectre v2 mitigation (or can otherwise be disabled by the BHI mitigation itself if needed). In that case retpolines are fine. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/6f56f13da34a0834b69163467449be7f58f253dc.1712813475.git.jpoimboe@kernel.org --- arch/x86/kernel/cpu/bugs.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1b0cfc136432..08dfb94fcb3a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1538,20 +1538,25 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_RETPOLINE; } +static bool __ro_after_init rrsba_disabled; + /* Disable in-kernel use of non-RSB RET predictors */ static void __init spec_ctrl_disable_kernel_rrsba(void) { - u64 x86_arch_cap_msr; + if (rrsba_disabled) + return; + + if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) { + rrsba_disabled = true; + return; + } if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) return; - x86_arch_cap_msr = x86_read_arch_cap_msr(); - - if (x86_arch_cap_msr & ARCH_CAP_RRSBA) { - x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; - update_spec_ctrl(x86_spec_ctrl_base); - } + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + rrsba_disabled = true; } static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) @@ -1652,9 +1657,11 @@ static void __init bhi_select_mitigation(void) return; /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) - return; + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { + spec_ctrl_disable_kernel_rrsba(); + if (rrsba_disabled) + return; + } if (spec_ctrl_bhi_dis()) return; @@ -2809,8 +2816,7 @@ static const char *spectre_bhi_state(void) return "; BHI: BHI_DIS_S"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; - else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && - !(x86_arch_cap_msr & ARCH_CAP_RRSBA)) + else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && rrsba_disabled) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Syscall hardening, KVM: SW loop"; From 5f882f3b0a8bf0788d5a0ee44b1191de5319bb8a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:48 -0700 Subject: [PATCH 150/270] x86/bugs: Clarify that syscall hardening isn't a BHI mitigation While syscall hardening helps prevent some BHI attacks, there's still other low-hanging fruit remaining. Don't classify it as a mitigation and make it clear that the system may still be vulnerable if it doesn't have a HW or SW mitigation enabled. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/b5951dae3fdee7f1520d5136a27be3bdfe95f88b.1712813475.git.jpoimboe@kernel.org --- Documentation/admin-guide/hw-vuln/spectre.rst | 11 +++++------ Documentation/admin-guide/kernel-parameters.txt | 3 +-- arch/x86/kernel/cpu/bugs.c | 6 +++--- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 3cf18e4a1d9a..5a39acf82483 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -441,10 +441,10 @@ The possible values in this file are: - System is protected by BHI_DIS_S * - BHI: SW loop, KVM SW loop - System is protected by software clearing sequence - * - BHI: Syscall hardening - - Syscalls are hardened against BHI - * - BHI: Syscall hardening, KVM: SW loop - - System is protected from userspace attacks by syscall hardening; KVM is protected by software clearing sequence + * - BHI: Vulnerable + - System is vulnerable to BHI + * - BHI: Vulnerable, KVM: SW loop + - System is vulnerable; KVM is protected by software clearing sequence Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will @@ -661,8 +661,7 @@ kernel command line. spectre_bhi= [X86] Control mitigation of Branch History Injection - (BHI) vulnerability. Syscalls are hardened against BHI - regardless of this setting. This setting affects the deployment + (BHI) vulnerability. This setting affects the deployment of the HW BHI control and the SW BHB clearing sequence. on diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a029ad6c4963..a3874cc97892 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6065,8 +6065,7 @@ See Documentation/admin-guide/laptops/sonypi.rst spectre_bhi= [X86] Control mitigation of Branch History Injection - (BHI) vulnerability. Syscalls are hardened against BHI - reglardless of this setting. This setting affects the + (BHI) vulnerability. This setting affects the deployment of the HW BHI control and the SW BHB clearing sequence. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 08dfb94fcb3a..9eeb60f5fbb3 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2818,10 +2818,10 @@ static const char *spectre_bhi_state(void) return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && rrsba_disabled) return "; BHI: Retpoline"; - else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) - return "; BHI: Syscall hardening, KVM: SW loop"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + return "; BHI: Vulnerable, KVM: SW loop"; - return "; BHI: Vulnerable (Syscall hardening enabled)"; + return "; BHI: Vulnerable"; } static ssize_t spectre_v2_show_state(char *buf) From f969eb84ce482331a991079ab7a5c4dc3b7f89bf Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 7 Apr 2024 14:56:04 +0800 Subject: [PATCH 151/270] netfilter: nf_tables: Fix potential data-race in __nft_expr_type_get() nft_unregister_expr() can concurrent with __nft_expr_type_get(), and there is not any protection when iterate over nf_tables_expressions list in __nft_expr_type_get(). Therefore, there is potential data-race of nf_tables_expressions list entry. Use list_for_each_entry_rcu() to iterate over nf_tables_expressions list in __nft_expr_type_get(), and use rcu_read_lock() in the caller nft_expr_type_get() to protect the entire type query process. Fixes: ef1f7df9170d ("netfilter: nf_tables: expression ops overloading") Signed-off-by: Ziyang Xuan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d89d77946719..53b8c00863ad 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3060,7 +3060,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, { const struct nft_expr_type *type, *candidate = NULL; - list_for_each_entry(type, &nf_tables_expressions, list) { + list_for_each_entry_rcu(type, &nf_tables_expressions, list) { if (!nla_strcmp(nla, type->name)) { if (!type->family && !candidate) candidate = type; @@ -3092,9 +3092,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); + rcu_read_lock(); type = __nft_expr_type_get(family, nla); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES From d78d867dcea69c328db30df665be5be7d0148484 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 7 Apr 2024 14:56:05 +0800 Subject: [PATCH 152/270] netfilter: nf_tables: Fix potential data-race in __nft_obj_type_get() nft_unregister_obj() can concurrent with __nft_obj_type_get(), and there is not any protection when iterate over nf_tables_objects list in __nft_obj_type_get(). Therefore, there is potential data-race of nf_tables_objects list entry. Use list_for_each_entry_rcu() to iterate over nf_tables_objects list in __nft_obj_type_get(), and use rcu_read_lock() in the caller nft_obj_type_get() to protect the entire type query process. Fixes: e50092404c1b ("netfilter: nf_tables: add stateful objects") Signed-off-by: Ziyang Xuan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 53b8c00863ad..f11d0c0a2c73 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7611,7 +7611,7 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family) { const struct nft_object_type *type; - list_for_each_entry(type, &nf_tables_objects, list) { + list_for_each_entry_rcu(type, &nf_tables_objects, list) { if (type->family != NFPROTO_UNSPEC && type->family != family) continue; @@ -7627,9 +7627,13 @@ nft_obj_type_get(struct net *net, u32 objtype, u8 family) { const struct nft_object_type *type; + rcu_read_lock(); type = __nft_obj_type_get(objtype, family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES From 751de2012eafa4d46d8081056761fa0e9cc8a178 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Apr 2024 11:24:59 +0200 Subject: [PATCH 153/270] netfilter: br_netfilter: skip conntrack input hook for promisc packets For historical reasons, when bridge device is in promisc mode, packets that are directed to the taps follow bridge input hook path. This patch adds a workaround to reset conntrack for these packets. Jianbo Liu reports warning splats in their test infrastructure where cloned packets reach the br_netfilter input hook to confirm the conntrack object. Scratch one bit from BR_INPUT_SKB_CB to annotate that this packet has reached the input hook because it is passed up to the bridge device to reach the taps. [ 57.571874] WARNING: CPU: 1 PID: 0 at net/bridge/br_netfilter_hooks.c:616 br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.572749] Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_isc si ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5ctl mlx5_core [ 57.575158] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.8.0+ #19 [ 57.575700] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 57.576662] RIP: 0010:br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.577195] Code: fe ff ff 41 bd 04 00 00 00 be 04 00 00 00 e9 4a ff ff ff be 04 00 00 00 48 89 ef e8 f3 a9 3c e1 66 83 ad b4 00 00 00 04 eb 91 <0f> 0b e9 f1 fe ff ff 0f 0b e9 df fe ff ff 48 89 df e8 b3 53 47 e1 [ 57.578722] RSP: 0018:ffff88885f845a08 EFLAGS: 00010202 [ 57.579207] RAX: 0000000000000002 RBX: ffff88812dfe8000 RCX: 0000000000000000 [ 57.579830] RDX: ffff88885f845a60 RSI: ffff8881022dc300 RDI: 0000000000000000 [ 57.580454] RBP: ffff88885f845a60 R08: 0000000000000001 R09: 0000000000000003 [ 57.581076] R10: 00000000ffff1300 R11: 0000000000000002 R12: 0000000000000000 [ 57.581695] R13: ffff8881047ffe00 R14: ffff888108dbee00 R15: ffff88814519b800 [ 57.582313] FS: 0000000000000000(0000) GS:ffff88885f840000(0000) knlGS:0000000000000000 [ 57.583040] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 57.583564] CR2: 000000c4206aa000 CR3: 0000000103847001 CR4: 0000000000370eb0 [ 57.584194] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 57.584820] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 57.585440] Call Trace: [ 57.585721] [ 57.585976] ? __warn+0x7d/0x130 [ 57.586323] ? br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.586811] ? report_bug+0xf1/0x1c0 [ 57.587177] ? handle_bug+0x3f/0x70 [ 57.587539] ? exc_invalid_op+0x13/0x60 [ 57.587929] ? asm_exc_invalid_op+0x16/0x20 [ 57.588336] ? br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.588825] nf_hook_slow+0x3d/0xd0 [ 57.589188] ? br_handle_vlan+0x4b/0x110 [ 57.589579] br_pass_frame_up+0xfc/0x150 [ 57.589970] ? br_port_flags_change+0x40/0x40 [ 57.590396] br_handle_frame_finish+0x346/0x5e0 [ 57.590837] ? ipt_do_table+0x32e/0x430 [ 57.591221] ? br_handle_local_finish+0x20/0x20 [ 57.591656] br_nf_hook_thresh+0x4b/0xf0 [br_netfilter] [ 57.592286] ? br_handle_local_finish+0x20/0x20 [ 57.592802] br_nf_pre_routing_finish+0x178/0x480 [br_netfilter] [ 57.593348] ? br_handle_local_finish+0x20/0x20 [ 57.593782] ? nf_nat_ipv4_pre_routing+0x25/0x60 [nf_nat] [ 57.594279] br_nf_pre_routing+0x24c/0x550 [br_netfilter] [ 57.594780] ? br_nf_hook_thresh+0xf0/0xf0 [br_netfilter] [ 57.595280] br_handle_frame+0x1f3/0x3d0 [ 57.595676] ? br_handle_local_finish+0x20/0x20 [ 57.596118] ? br_handle_frame_finish+0x5e0/0x5e0 [ 57.596566] __netif_receive_skb_core+0x25b/0xfc0 [ 57.597017] ? __napi_build_skb+0x37/0x40 [ 57.597418] __netif_receive_skb_list_core+0xfb/0x220 Fixes: 62e7151ae3eb ("netfilter: bridge: confirm multicast packets before passing them up the stack") Reported-by: Jianbo Liu Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_input.c | 15 +++++++++++---- net/bridge/br_netfilter_hooks.c | 6 ++++++ net/bridge/br_private.h | 1 + net/bridge/netfilter/nf_conntrack_bridge.c | 14 ++++++++++---- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index f21097e73482..ceaa5a89b947 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -30,7 +30,7 @@ br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) return netif_receive_skb(skb); } -static int br_pass_frame_up(struct sk_buff *skb) +static int br_pass_frame_up(struct sk_buff *skb, bool promisc) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); @@ -65,6 +65,8 @@ static int br_pass_frame_up(struct sk_buff *skb) br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), BR_MCAST_DIR_TX); + BR_INPUT_SKB_CB(skb)->promisc = promisc; + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, br_netif_receive_skb); @@ -82,6 +84,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_mcast *brmctx; struct net_bridge_vlan *vlan; struct net_bridge *br; + bool promisc; u16 vid = 0; u8 state; @@ -137,7 +140,9 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); - local_rcv = !!(br->dev->flags & IFF_PROMISC); + promisc = !!(br->dev->flags & IFF_PROMISC); + local_rcv = promisc; + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { @@ -200,7 +205,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb unsigned long now = jiffies; if (test_bit(BR_FDB_LOCAL, &dst->flags)) - return br_pass_frame_up(skb); + return br_pass_frame_up(skb, false); if (now != dst->used) dst->used = now; @@ -213,7 +218,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } if (local_rcv) - return br_pass_frame_up(skb); + return br_pass_frame_up(skb, promisc); out: return 0; @@ -386,6 +391,8 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) goto forward; } + BR_INPUT_SKB_CB(skb)->promisc = false; + /* The else clause should be hit when nf_hook(): * - returns < 0 (drop/error) * - returns = 0 (stolen/nf_queue) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 35e10c5a766d..22e35623c148 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -600,11 +600,17 @@ static unsigned int br_nf_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + bool promisc = BR_INPUT_SKB_CB(skb)->promisc; struct nf_conntrack *nfct = skb_nfct(skb); const struct nf_ct_hook *ct_hook; struct nf_conn *ct; int ret; + if (promisc) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 86ea5e6689b5..d4bedc87b1d8 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -589,6 +589,7 @@ struct br_input_skb_cb { #endif u8 proxyarp_replied:1; u8 src_port_isolated:1; + u8 promisc:1; #ifdef CONFIG_BRIDGE_VLAN_FILTERING u8 vlan_filtered:1; #endif diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 6f877e31709b..c3c51b9a6826 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -294,18 +294,24 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - enum ip_conntrack_info ctinfo; + bool promisc = BR_INPUT_SKB_CB(skb)->promisc; + struct nf_conntrack *nfct = skb_nfct(skb); struct nf_conn *ct; - if (skb->pkt_type == PACKET_HOST) + if (promisc) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + + if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; /* nf_conntrack_confirm() cannot handle concurrent clones, * this happens for broad/multicast frames with e.g. macvlan on top * of the bridge device. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) + ct = container_of(nfct, struct nf_conn, ct_general); + if (nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) return NF_ACCEPT; /* let inet prerouting call conntrack again */ From 29b359cf6d95fd60730533f7f10464e95bd17c73 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 10 Apr 2024 18:50:45 +0200 Subject: [PATCH 154/270] netfilter: nft_set_pipapo: walk over current view on netlink dump The generation mask can be updated while netlink dump is in progress. The pipapo set backend walk iterator cannot rely on it to infer what view of the datastructure is to be used. Add notation to specify if user wants to read/update the set. Based on patch from Florian Westphal. Fixes: 2b84e215f874 ("netfilter: nft_set_pipapo: .walk does not deal with generations") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 14 ++++++++++++++ net/netfilter/nf_tables_api.c | 6 ++++++ net/netfilter/nft_set_pipapo.c | 5 +++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e27c28b612e4..3f1ed467f951 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -307,9 +307,23 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) return (void *)priv; } + +/** + * enum nft_iter_type - nftables set iterator type + * + * @NFT_ITER_READ: read-only iteration over set elements + * @NFT_ITER_UPDATE: iteration under mutex to update set element state + */ +enum nft_iter_type { + NFT_ITER_UNSPEC, + NFT_ITER_READ, + NFT_ITER_UPDATE, +}; + struct nft_set; struct nft_set_iter { u8 genmask; + enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f11d0c0a2c73..a7a34db62ea9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -626,6 +626,7 @@ static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_deactivate, }; @@ -5445,6 +5446,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, } iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; @@ -5518,6 +5520,7 @@ static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_activate, }; @@ -5892,6 +5895,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.skb = skb; args.reset = dump_ctx->reset; args.iter.genmask = nft_genmask_cur(net); + args.iter.type = NFT_ITER_READ; args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; @@ -7376,6 +7380,7 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { struct nft_set_iter iter = { .genmask = genmask, + .type = NFT_ITER_UPDATE, .fn = nft_setelem_flush, }; @@ -10879,6 +10884,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, continue; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index df8de5090246..11e44e4dfb1f 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2115,13 +2115,14 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); - struct net *net = read_pnet(&set->net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; unsigned int i, r; + WARN_ON_ONCE(iter->type == NFT_ITER_UNSPEC); + rcu_read_lock(); - if (iter->genmask == nft_genmask_cur(net)) + if (iter->type == NFT_ITER_READ) m = rcu_dereference(priv->match); else m = priv->clone; From 3cfc9ec039af60dbd8965ae085b2c2ccdcfbe1cc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Apr 2024 21:05:13 +0200 Subject: [PATCH 155/270] netfilter: nft_set_pipapo: do not free live element Pablo reports a crash with large batches of elements with a back-to-back add/remove pattern. Quoting Pablo: add_elem("00000000") timeout 100 ms ... add_elem("0000000X") timeout 100 ms del_elem("0000000X") <---------------- delete one that was just added ... add_elem("00005000") timeout 100 ms 1) nft_pipapo_remove() removes element 0000000X Then, KASAN shows a splat. Looking at the remove function there is a chance that we will drop a rule that maps to a non-deactivated element. Removal happens in two steps, first we do a lookup for key k and return the to-be-removed element and mark it as inactive in the next generation. Then, in a second step, the element gets removed from the set/map. The _remove function does not work correctly if we have more than one element that share the same key. This can happen if we insert an element into a set when the set already holds an element with same key, but the element mapping to the existing key has timed out or is not active in the next generation. In such case its possible that removal will unmap the wrong element. If this happens, we will leak the non-deactivated element, it becomes unreachable. The element that got deactivated (and will be freed later) will remain reachable in the set data structure, this can result in a crash when such an element is retrieved during lookup (stale pointer). Add a check that the fully matching key does in fact map to the element that we have marked as inactive in the deactivation step. If not, we need to continue searching. Add a bug/warn trap at the end of the function as well, the remove function must not ever be called with an invisible/unreachable/non-existent element. v2: avoid uneeded temporary variable (Stefano) Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Reported-by: Pablo Neira Ayuso Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 11e44e4dfb1f..eeaf05ffba95 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2077,6 +2077,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + if (!pipapo_match_field(f, start, rules_fx, match_start, match_end)) break; @@ -2089,16 +2091,18 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); - } - if (i == m->field_count) { - priv->dirty = true; - pipapo_drop(m, rulemap); - return; + if (last && f->mt[rulemap[i].to].e == e) { + priv->dirty = true; + pipapo_drop(m, rulemap); + return; + } } first_rule += rules_f0; } + + WARN_ON_ONCE(1); /* elem_priv not found */ } /** From 87b3593bed1868b2d9fe096c01bcdf0ea86cbebf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Apr 2024 13:47:33 +0200 Subject: [PATCH 156/270] netfilter: flowtable: validate pppoe header Ensure there is sufficient room to access the protocol field of the PPPoe header. Validate it once before the flowtable lookup, then use a helper function to access protocol field. Reported-by: syzbot+b6f07e1c07ef40199081@syzkaller.appspotmail.com Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 12 +++++++++++- net/netfilter/nf_flow_table_inet.c | 3 ++- net/netfilter/nf_flow_table_ip.c | 8 +++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a763dd327c6e..9abb7ee40d72 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -336,7 +336,7 @@ int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); -static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) +static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb) { __be16 proto; @@ -352,6 +352,16 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) return 0; } +static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto) +{ + if (!pskb_may_pull(skb, PPPOE_SES_HLEN)) + return false; + + *inner_proto = __nf_flow_pppoe_proto(skb); + + return true; +} + #define NF_FLOW_TABLE_STAT_INC(net, count) __this_cpu_inc((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_DEC(net, count) __this_cpu_dec((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count) \ diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 9505f9d188ff..6eef15648b7b 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -21,7 +21,8 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, proto = veth->h_vlan_encapsulated_proto; break; case htons(ETH_P_PPP_SES): - proto = nf_flow_pppoe_proto(skb); + if (!nf_flow_pppoe_proto(skb, &proto)) + return NF_ACCEPT; break; default: proto = skb->protocol; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index e45fade76409..9e9e105052da 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -273,10 +273,11 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, +static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; + __be16 inner_proto; switch (skb->protocol) { case htons(ETH_P_8021Q): @@ -287,7 +288,8 @@ static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, } break; case htons(ETH_P_PPP_SES): - if (nf_flow_pppoe_proto(skb) == proto) { + if (nf_flow_pppoe_proto(skb, &inner_proto) && + inner_proto == proto) { *offset += PPPOE_SES_HLEN; return true; } @@ -316,7 +318,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb, skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): - skb->protocol = nf_flow_pppoe_proto(skb); + skb->protocol = __nf_flow_pppoe_proto(skb); skb_pull(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; From 6db5dc7b351b9569940cd1cf445e237c42cd6d27 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Apr 2024 00:09:00 +0200 Subject: [PATCH 157/270] netfilter: flowtable: incorrect pppoe tuple pppoe traffic reaching ingress path does not match the flowtable entry because the pppoe header is expected to be at the network header offset. This bug causes a mismatch in the flow table lookup, so pppoe packets enter the classical forwarding path. Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 9e9e105052da..5383bed3d3e0 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -157,7 +157,7 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, tuple->encap[i].proto = skb->protocol; break; case htons(ETH_P_PPP_SES): - phdr = (struct pppoe_hdr *)skb_mac_header(skb); + phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; break; From f488138b526715c6d2568d7329c4477911be4210 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 11 Apr 2024 11:45:57 +0200 Subject: [PATCH 158/270] NFSD: fix endianness issue in nfsd4_encode_fattr4 The nfs4 mount fails with EIO on 64-bit big endian architectures since v6.7. The issue arises from employing a union in the nfsd4_encode_fattr4() function to overlay a 32-bit array with a 64-bit values based bitmap, which does not function as intended. Address the endianness issue by utilizing bitmap_from_arr32() to copy 32-bit attribute masks into a bitmap in an endianness-agnostic manner. Cc: stable@vger.kernel.org Fixes: fce7913b13d0 ("NFSD: Use a bitmask loop to encode FATTR4 results") Link: https://bugs.launchpad.net/ubuntu/+source/nfs-utils/+bug/2060217 Signed-off-by: Vasily Gorbik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index fac938f563ad..1955481832e0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3490,11 +3490,13 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct dentry *dentry, const u32 *bmval, int ignore_crossmnt) { + DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); struct nfsd4_fattr_args args; struct svc_fh *tempfh = NULL; int starting_len = xdr->buf->len; __be32 *attrlen_p, status; int attrlen_offset; + u32 attrmask[3]; int err; struct nfsd4_compoundres *resp = rqstp->rq_resp; u32 minorversion = resp->cstate.minorversion; @@ -3502,10 +3504,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, .mnt = exp->ex_path.mnt, .dentry = dentry, }; - union { - u32 attrmask[3]; - unsigned long mask[2]; - } u; unsigned long bit; bool file_modified = false; u64 size = 0; @@ -3521,20 +3519,19 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, /* * Make a local copy of the attribute bitmap that can be modified. */ - memset(&u, 0, sizeof(u)); - u.attrmask[0] = bmval[0]; - u.attrmask[1] = bmval[1]; - u.attrmask[2] = bmval[2]; + attrmask[0] = bmval[0]; + attrmask[1] = bmval[1]; + attrmask[2] = bmval[2]; args.rdattr_err = 0; if (exp->ex_fslocs.migrated) { - status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1], - &u.attrmask[2], &args.rdattr_err); + status = fattr_handle_absent_fs(&attrmask[0], &attrmask[1], + &attrmask[2], &args.rdattr_err); if (status) goto out; } args.size = 0; - if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { + if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), &file_modified, &size); if (status) @@ -3553,16 +3550,16 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (!(args.stat.result_mask & STATX_BTIME)) /* underlying FS does not offer btime so we can't share it */ - u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE; - if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE; + if ((attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || - (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | + (attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { err = vfs_statfs(&path, &args.statfs); if (err) goto out_nfserr; } - if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && + if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); status = nfserr_jukebox; @@ -3577,10 +3574,10 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.fhp = fhp; args.acl = NULL; - if (u.attrmask[0] & FATTR4_WORD0_ACL) { + if (attrmask[0] & FATTR4_WORD0_ACL) { err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl); if (err == -EOPNOTSUPP) - u.attrmask[0] &= ~FATTR4_WORD0_ACL; + attrmask[0] &= ~FATTR4_WORD0_ACL; else if (err == -EINVAL) { status = nfserr_attrnotsupp; goto out; @@ -3592,17 +3589,17 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL args.context = NULL; - if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || - u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { + if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || + attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) err = security_inode_getsecctx(d_inode(dentry), &args.context, &args.contextlen); else err = -EOPNOTSUPP; args.contextsupport = (err == 0); - if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) { + if (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) { if (err == -EOPNOTSUPP) - u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; else if (err) goto out_nfserr; } @@ -3610,8 +3607,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ /* attrmask */ - status = nfsd4_encode_bitmap4(xdr, u.attrmask[0], - u.attrmask[1], u.attrmask[2]); + status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1], + attrmask[2]); if (status) goto out; @@ -3620,7 +3617,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, attrlen_p = xdr_reserve_space(xdr, XDR_UNIT); if (!attrlen_p) goto out_resource; - for_each_set_bit(bit, (const unsigned long *)&u.mask, + bitmap_from_arr32(attr_bitmap, attrmask, + ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); + for_each_set_bit(bit, attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) { status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args); if (status != nfs_ok) From 50a9b7fc151e67b9e642232d32e8c5a5ac13e64a Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 5 Apr 2024 13:07:11 -0700 Subject: [PATCH 159/270] drm/xe/display: Fix double mutex initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All of these mutexes are already initialized by the display side since commit 3fef3e6ff86a ("drm/i915: move display mutex inits to display code"), so the xe shouldn´t initialize them. Fixes: 44e694958b95 ("drm/xe/display: Implement display support") Cc: Jani Nikula Cc: Arun R Murthy Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20240405200711.2041428-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 117de185edf2c5767f03575219bf7a43b161ff0d) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/display/xe_display.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index e4db069f0db3..6ec375c1c4b6 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -108,11 +108,6 @@ int xe_display_create(struct xe_device *xe) xe->display.hotplug.dp_wq = alloc_ordered_workqueue("xe-dp", 0); drmm_mutex_init(&xe->drm, &xe->sb_lock); - drmm_mutex_init(&xe->drm, &xe->display.backlight.lock); - drmm_mutex_init(&xe->drm, &xe->display.audio.mutex); - drmm_mutex_init(&xe->drm, &xe->display.wm.wm_mutex); - drmm_mutex_init(&xe->drm, &xe->display.pps.mutex); - drmm_mutex_init(&xe->drm, &xe->display.hdcp.hdcp_mutex); xe->enabled_irq_mask = ~0; err = drmm_add_action_or_reset(&xe->drm, display_destroy, NULL); From a8ad8715472bb8f6a2ea8b4072a28151eb9f4f24 Mon Sep 17 00:00:00 2001 From: Karthik Poosa Date: Fri, 5 Apr 2024 18:31:27 +0530 Subject: [PATCH 160/270] drm/xe/hwmon: Cast result to output precision on left shift of operand Address potential overflow in result of left shift of a lower precision (u32) operand before assignment to higher precision (u64) variable. v2: - Update commit message. (Himal) Fixes: 4446fcf220ce ("drm/xe/hwmon: Expose power1_max_interval") Signed-off-by: Karthik Poosa Reviewed-by: Anshuman Gupta Cc: Badal Nilawar Link: https://patchwork.freedesktop.org/patch/msgid/20240405130127.1392426-5-karthik.poosa@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 883232b47b81108b0252197c747f396ecd51455a) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_hwmon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index b82233a41606..9ac7fbe201b3 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -290,7 +290,7 @@ xe_hwmon_power1_max_interval_show(struct device *dev, struct device_attribute *a * As y can be < 2, we compute tau4 = (4 | x) << y * and then add 2 when doing the final right shift to account for units */ - tau4 = ((1 << x_w) | x) << y; + tau4 = (u64)((1 << x_w) | x) << y; /* val in hwmon interface units (millisec) */ out = mul_u64_u32_shr(tau4, SF_TIME, hwmon->scl_shift_time + x_w); @@ -330,7 +330,7 @@ xe_hwmon_power1_max_interval_store(struct device *dev, struct device_attribute * r = FIELD_PREP(PKG_MAX_WIN, PKG_MAX_WIN_DEFAULT); x = REG_FIELD_GET(PKG_MAX_WIN_X, r); y = REG_FIELD_GET(PKG_MAX_WIN_Y, r); - tau4 = ((1 << x_w) | x) << y; + tau4 = (u64)((1 << x_w) | x) << y; max_win = mul_u64_u32_shr(tau4, SF_TIME, hwmon->scl_shift_time + x_w); if (val > max_win) From 9cb46b31f3d08ed3fce86349e8c12f96d7c88717 Mon Sep 17 00:00:00 2001 From: Himal Prasad Ghimiray Date: Mon, 1 Apr 2024 23:23:00 +0530 Subject: [PATCH 161/270] drm/xe/xe_migrate: Cast to output precision before multiplying operands Addressing potential overflow in result of multiplication of two lower precision (u32) operands before widening it to higher precision (u64). -v2 Fix commit message and description. (Rodrigo) Cc: Rodrigo Vivi Signed-off-by: Himal Prasad Ghimiray Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240401175300.3823653-1-himal.prasad.ghimiray@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 34820967ae7b45411f8f4f737c2d63b0c608e0d7) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_migrate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index ee1bb938c493..2ba4fb9511f6 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -227,7 +227,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, if (vm->flags & XE_VM_FLAG_64K && level == 1) flags = XE_PDE_64K; - entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (level - 1) * + entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (u64)(level - 1) * XE_PAGE_SIZE, pat_index); xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64, entry | flags); @@ -235,7 +235,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, /* Write PDE's that point to our BO. */ for (i = 0; i < num_entries - num_level; i++) { - entry = vm->pt_ops->pde_encode_bo(bo, i * XE_PAGE_SIZE, + entry = vm->pt_ops->pde_encode_bo(bo, (u64)i * XE_PAGE_SIZE, pat_index); xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE + @@ -291,7 +291,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, #define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE) #define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64)) drm_suballoc_manager_init(&m->vm_update_sa, - (map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) * + (size_t)(map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) * NUM_VMUSA_UNIT_PER_PAGE, 0); m->pt_bo = bo; @@ -490,7 +490,7 @@ static void emit_pte(struct xe_migrate *m, struct xe_vm *vm = m->q->vm; u16 pat_index; u32 ptes; - u64 ofs = at_pt * XE_PAGE_SIZE; + u64 ofs = (u64)at_pt * XE_PAGE_SIZE; u64 cur_ofs; /* Indirect access needs compression enabled uncached PAT index */ From f76646c83f028c62853c23dac49204232e903597 Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Thu, 4 Apr 2024 09:12:56 -0700 Subject: [PATCH 162/270] drm/xe: Label RING_CONTEXT_CONTROL as masked RING_CONTEXT_CONTROL is a masked register. v2: Also clean up setting register value (Lucas) Reviewed-by: Matt Roper Reviewed-by: Lucas De Marchi Signed-off-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20240404161256.3852502-1-ashutosh.dixit@intel.com (cherry picked from commit dc30c6e7149baaae4288c742de95212b31f07438) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/regs/xe_engine_regs.h | 2 +- drivers/gpu/drm/xe/xe_lrc.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index 0b1266c88a6a..deddc8be48c0 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -125,7 +125,7 @@ #define RING_EXECLIST_STATUS_LO(base) XE_REG((base) + 0x234) #define RING_EXECLIST_STATUS_HI(base) XE_REG((base) + 0x234 + 4) -#define RING_CONTEXT_CONTROL(base) XE_REG((base) + 0x244) +#define RING_CONTEXT_CONTROL(base) XE_REG((base) + 0x244, XE_REG_OPTION_MASKED) #define CTX_CTRL_INHIBIT_SYN_CTX_SWITCH REG_BIT(3) #define CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT REG_BIT(0) diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index 1426febe86eb..57066faf575e 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -525,9 +525,8 @@ static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class) static void set_context_control(u32 *regs, struct xe_hw_engine *hwe) { - regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH) | - _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) | - CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; + regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH | + CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); /* TODO: Timestamp */ } From b372e96bd0a32729d55d27f613c8bc80708a82e1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 25 Mar 2024 09:21:20 +1100 Subject: [PATCH 163/270] ceph: redirty page before returning AOP_WRITEPAGE_ACTIVATE The page has been marked clean before writepage is called. If we don't redirty it before postponing the write, it might never get written. Cc: stable@vger.kernel.org Fixes: 503d4fa6ee28 ("ceph: remove reliance on bdi congestion") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1340d77124ae..ee9caf7916fb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -795,8 +795,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) ihold(inode); if (wbc->sync_mode == WB_SYNC_NONE && - ceph_inode_to_fs_client(inode)->write_congested) + ceph_inode_to_fs_client(inode)->write_congested) { + redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; + } wait_on_page_fscache(page); From 17f8dc2db52185460f212052f3a692c1fdc167ba Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 9 Apr 2024 08:56:03 +0800 Subject: [PATCH 164/270] ceph: switch to use cap_delay_lock for the unlink delay list The same list item will be used in both cap_delay_list and cap_unlink_delay_list, so it's buggy to use two different locks to protect them. Cc: stable@vger.kernel.org Fixes: dbc347ef7f0c ("ceph: add ceph_cap_unlink_work to fire check_caps() immediately") Link: https://lists.ceph.io/hyperkitty/list/ceph-users@ceph.io/thread/AODC76VXRAMXKLFDCTK4TKFDDPWUSCN5 Reported-by: Marc Ruhmann Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Tested-by: Marc Ruhmann Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 4 ++-- fs/ceph/mds_client.c | 9 ++++----- fs/ceph/mds_client.h | 3 +-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 55051ad09c19..c4941ba245ac 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4783,13 +4783,13 @@ int ceph_drop_caps_for_unlink(struct inode *inode) doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) list_del_init(&ci->i_cap_delay_list); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_unlink_delay_list); - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); /* * Fire the work immediately, because the MDS maybe diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3ab9c268a8bb..360b686c3c67 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2504,7 +2504,7 @@ static void ceph_cap_unlink_work(struct work_struct *work) struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "begin\n"); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); while (!list_empty(&mdsc->cap_unlink_delay_list)) { struct ceph_inode_info *ci; struct inode *inode; @@ -2516,15 +2516,15 @@ static void ceph_cap_unlink_work(struct work_struct *work) inode = igrab(&ci->netfs.inode); if (inode) { - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "on %p %llx.%llx\n", inode, ceph_vinop(inode)); ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); } } - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); } @@ -5404,7 +5404,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_LIST_HEAD(&mdsc->cap_wait_list); spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); - spin_lock_init(&mdsc->cap_unlink_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 03f8ff00874f..b88e80415224 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -461,9 +461,8 @@ struct ceph_mds_client { struct delayed_work delayed_work; /* delayed work */ unsigned long last_renew_caps; /* last time we renewed our caps */ struct list_head cap_delay_list; /* caps with delayed release */ - spinlock_t cap_delay_lock; /* protects cap_delay_list */ struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */ - spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */ + spinlock_t cap_delay_lock; /* protects cap_delay_list and cap_unlink_delay_list */ struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; From d3e0469306793972fd2b1ea016fa7ab0658c9849 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 9 Apr 2024 07:01:57 -0400 Subject: [PATCH 165/270] MAINTAINERS: remove myself as a Reviewer for Ceph It has been a couple of years since I stepped down as CephFS maintainer. I'm not involved in any meaningful way with the project these days, so while I'm happy to help review the occasional patch, I don't need to be cc'ed on all of them. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index aea47e04c3a5..c45e2c3f6ff9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4869,7 +4869,6 @@ F: drivers/power/supply/cw2015_battery.c CEPH COMMON CODE (LIBCEPH) M: Ilya Dryomov M: Xiubo Li -R: Jeff Layton L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ @@ -4881,7 +4880,6 @@ F: net/ceph/ CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH) M: Xiubo Li M: Ilya Dryomov -R: Jeff Layton L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ From 28e0947651ce6a2200b9a7eceb93282e97d7e51a Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 6 Apr 2024 23:16:08 -0500 Subject: [PATCH 166/270] smb3: fix Open files on server counter going negative We were decrementing the count of open files on server twice for the case where we were closing cached directories. Fixes: 8e843bf38f7b ("cifs: return a single-use cfid if we did not get a lease") Cc: stable@vger.kernel.org Acked-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cached_dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 13a9d7acf8f8..0ff2491c311d 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -433,8 +433,8 @@ smb2_close_cached_fid(struct kref *ref) if (cfid->is_open) { rc = SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, cfid->fid.volatile_fid); - if (rc != -EBUSY && rc != -EAGAIN) - atomic_dec(&cfid->tcon->num_remote_opens); + if (rc) /* should we retry on -EBUSY or -EAGAIN? */ + cifs_dbg(VFS, "close cached dir rc %d\n", rc); } free_cached_dir(cfid); From c6ff459037b2e35450af2351037eac4c8aca1d6b Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 9 Apr 2024 11:28:59 -0300 Subject: [PATCH 167/270] smb: client: instantiate when creating SFU files In cifs_sfu_make_node(), on success, instantiate rather than leave it with dentry unhashed negative to support callers that expect mknod(2) to always instantiate. This fixes the following test case: mount.cifs //srv/share /mnt -o ...,sfu mkfifo /mnt/fifo ./xfstests/ltp/growfiles -b -W test -e 1 -u -i 0 -L 30 /mnt/fifo ... BUG: unable to handle page fault for address: 000000034cec4e58 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 1 PREEMPT SMP PTI CPU: 0 PID: 138098 Comm: growfiles Kdump: loaded Not tainted 5.14.0-436.3987_1240945149.el9.x86_64 #1 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:_raw_callee_save__kvm_vcpu_is_preempted+0x0/0x20 Code: e8 15 d9 61 00 e9 63 ff ff ff 41 bd ea ff ff ff e9 58 ff ff ff e8 d0 71 c0 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 <48> 8b 04 fd 60 2b c1 99 80 b8 90 50 03 00 00 0f 95 c0 c3 cc cc cc RSP: 0018:ffffb6a143cf7cf8 EFLAGS: 00010206 RAX: ffff8a9bc30fb038 RBX: ffff8a9bc666a200 RCX: ffff8a9cc0260000 RDX: 00000000736f622e RSI: ffff8a9bc30fb038 RDI: 000000007665645f RBP: ffffb6a143cf7d70 R08: 0000000000001000 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000000 R12: ffff8a9bc666a200 R13: 0000559a302a12b0 R14: 0000000000001000 R15: 0000000000000000 FS: 00007fbed1dbb740(0000) GS:ffff8a9cf0000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000034cec4e58 CR3: 0000000128ec6006 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? show_trace_log_lvl+0x1c4/0x2df ? show_trace_log_lvl+0x1c4/0x2df ? __mutex_lock.constprop.0+0x5f7/0x6a0 ? __die_body.cold+0x8/0xd ? page_fault_oops+0x134/0x170 ? exc_page_fault+0x62/0x150 ? asm_exc_page_fault+0x22/0x30 ? _pfx_raw_callee_save__kvm_vcpu_is_preempted+0x10/0x10 __mutex_lock.constprop.0+0x5f7/0x6a0 ? __mod_memcg_lruvec_state+0x84/0xd0 pipe_write+0x47/0x650 ? do_anonymous_page+0x258/0x410 ? inode_security+0x22/0x60 ? selinux_file_permission+0x108/0x150 vfs_write+0x2cb/0x410 ksys_write+0x5f/0xe0 do_syscall_64+0x5c/0xf0 ? syscall_exit_to_user_mode+0x22/0x40 ? do_syscall_64+0x6b/0xf0 ? sched_clock_cpu+0x9/0xc0 ? exc_page_fault+0x62/0x150 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Cc: stable@vger.kernel.org Fixes: 72bc63f5e23a ("smb3: fix creating FIFOs when mounting with "sfu" mount option") Suggested-by: Al Viro Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 94 ++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index b156eefa75d7..78c94d0350fe 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4964,68 +4964,84 @@ static int smb2_next_header(struct TCP_Server_Info *server, char *buf, return 0; } -int cifs_sfu_make_node(unsigned int xid, struct inode *inode, - struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) +static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) { - struct cifs_open_info_data buf = {}; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {}; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_fid fid; unsigned int bytes_written; - struct win_dev *pdev; + struct win_dev pdev = {}; struct kvec iov[2]; __u32 oplock = server->oplocks ? REQ_OPLOCK : 0; int rc; - if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode)) + switch (mode & S_IFMT) { + case S_IFCHR: + strscpy(pdev.type, "IntxCHR"); + pdev.major = cpu_to_le64(MAJOR(dev)); + pdev.minor = cpu_to_le64(MINOR(dev)); + break; + case S_IFBLK: + strscpy(pdev.type, "IntxBLK"); + pdev.major = cpu_to_le64(MAJOR(dev)); + pdev.minor = cpu_to_le64(MINOR(dev)); + break; + case S_IFIFO: + strscpy(pdev.type, "LnxFIFO"); + break; + default: return -EPERM; + } - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .cifs_sb = cifs_sb, - .desired_access = GENERIC_WRITE, - .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL), - .disposition = FILE_CREATE, - .path = full_path, - .fid = &fid, - }; + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, GENERIC_WRITE, + FILE_CREATE, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL, ACL_NO_MODE); + oparms.fid = &fid; - rc = server->ops->open(xid, &oparms, &oplock, &buf); + rc = server->ops->open(xid, &oparms, &oplock, NULL); if (rc) return rc; - /* - * BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely. - */ - pdev = (struct win_dev *)&buf.fi; io_parms.pid = current->tgid; io_parms.tcon = tcon; - io_parms.length = sizeof(*pdev); - iov[1].iov_base = pdev; - iov[1].iov_len = sizeof(*pdev); - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - } else if (S_ISFIFO(mode)) { - memcpy(pdev->type, "LnxFIFO", 8); - } + io_parms.length = sizeof(pdev); + iov[1].iov_base = &pdev; + iov[1].iov_len = sizeof(pdev); rc = server->ops->sync_write(xid, &fid, &io_parms, &bytes_written, iov, 1); server->ops->close(xid, tcon, &fid); - d_drop(dentry); - /* FIXME: add code here to set EAs */ - cifs_free_open_info(&buf); + return rc; +} + +int cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) +{ + struct inode *new = NULL; + int rc; + + rc = __cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); + if (rc) + return rc; + + if (tcon->posix_extensions) { + rc = smb311_posix_get_inode_info(&new, full_path, NULL, + inode->i_sb, xid); + } else if (tcon->unix_ext) { + rc = cifs_get_inode_info_unix(&new, full_path, + inode->i_sb, xid); + } else { + rc = cifs_get_inode_info(&new, full_path, NULL, + inode->i_sb, xid, NULL); + } + if (!rc) + d_instantiate(dentry, new); return rc; } From 35f834265e0dc78b003aa0d1af65cafb89666b76 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 4 Apr 2024 18:06:56 -0500 Subject: [PATCH 168/270] smb3: fix broken reconnect when password changing on the server by allowing password rotation There are various use cases that are becoming more common in which password changes are scheduled on a server(s) periodically but the clients connected to this server need to stay connected (even in the face of brief network reconnects) due to mounts which can not be easily unmounted and mounted at will, and servers that do password rotation do not always have the ability to tell the clients exactly when to the new password will be effective, so add support for an alt password ("password2=") on mount (and also remount) so that we can anticipate the upcoming change to the server without risking breaking existing mounts. An alternative would have been to use the kernel keyring for this but the processes doing the reconnect do not have access to the keyring but do have access to the ses structure. Reviewed-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 + fs/smb/client/connect.c | 8 ++++++++ fs/smb/client/fs_context.c | 21 +++++++++++++++++++++ fs/smb/client/fs_context.h | 2 ++ fs/smb/client/misc.c | 1 + fs/smb/client/smb2pdu.c | 11 +++++++++++ 6 files changed, 44 insertions(+) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index f6a302205f89..d6669ce4ae87 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1077,6 +1077,7 @@ struct cifs_ses { and after mount option parsing we fill it */ char *domainName; char *password; + char *password2; /* When key rotation used, new password may be set before it expires */ char workstation_name[CIFS_MAX_WORKSTATION_LEN]; struct session_key auth_key; struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 85679ae106fd..4e35970681bf 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2183,6 +2183,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) } ++delim; + /* BB consider adding support for password2 (Key Rotation) for multiuser in future */ ctx->password = kstrndup(delim, len, GFP_KERNEL); if (!ctx->password) { cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n", @@ -2206,6 +2207,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) kfree(ctx->username); ctx->username = NULL; kfree_sensitive(ctx->password); + /* no need to free ctx->password2 since not allocated in this path */ ctx->password = NULL; goto out_key_put; } @@ -2317,6 +2319,12 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) if (!ses->password) goto get_ses_fail; } + /* ctx->password freed at unmount */ + if (ctx->password2) { + ses->password2 = kstrdup(ctx->password2, GFP_KERNEL); + if (!ses->password2) + goto get_ses_fail; + } if (ctx->domainname) { ses->domainName = kstrdup(ctx->domainname, GFP_KERNEL); if (!ses->domainName) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index b7bfe705b2c4..6c727d8c31e8 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -162,6 +162,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_string("username", Opt_user), fsparam_string("pass", Opt_pass), fsparam_string("password", Opt_pass), + fsparam_string("password2", Opt_pass2), fsparam_string("ip", Opt_ip), fsparam_string("addr", Opt_ip), fsparam_string("domain", Opt_domain), @@ -345,6 +346,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx new_ctx->nodename = NULL; new_ctx->username = NULL; new_ctx->password = NULL; + new_ctx->password2 = NULL; new_ctx->server_hostname = NULL; new_ctx->domainname = NULL; new_ctx->UNC = NULL; @@ -357,6 +359,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(prepath); DUP_CTX_STR(username); DUP_CTX_STR(password); + DUP_CTX_STR(password2); DUP_CTX_STR(server_hostname); DUP_CTX_STR(UNC); DUP_CTX_STR(source); @@ -905,6 +908,8 @@ static int smb3_reconfigure(struct fs_context *fc) else { kfree_sensitive(ses->password); ses->password = kstrdup(ctx->password, GFP_KERNEL); + kfree_sensitive(ses->password2); + ses->password2 = kstrdup(ctx->password2, GFP_KERNEL); } STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); @@ -1305,6 +1310,18 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } break; + case Opt_pass2: + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; + if (strlen(param->string) == 0) + break; + + ctx->password2 = kstrdup(param->string, GFP_KERNEL); + if (ctx->password2 == NULL) { + cifs_errorf(fc, "OOM when copying password2 string\n"); + goto cifs_parse_mount_err; + } + break; case Opt_ip: if (strlen(param->string) == 0) { ctx->got_ip = false; @@ -1608,6 +1625,8 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_parse_mount_err: kfree_sensitive(ctx->password); ctx->password = NULL; + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; return -EINVAL; } @@ -1713,6 +1732,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->username = NULL; kfree_sensitive(ctx->password); ctx->password = NULL; + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; kfree(ctx->server_hostname); ctx->server_hostname = NULL; kfree(ctx->UNC); diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 8a35645e0b65..a947bddeba27 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -145,6 +145,7 @@ enum cifs_param { Opt_source, Opt_user, Opt_pass, + Opt_pass2, Opt_ip, Opt_domain, Opt_srcaddr, @@ -177,6 +178,7 @@ struct smb3_fs_context { char *username; char *password; + char *password2; char *domainname; char *source; char *server_hostname; diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 33ac4f8f5050..7d15a1969b81 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -98,6 +98,7 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree(buf_to_free->serverDomain); kfree(buf_to_free->serverNOS); kfree_sensitive(buf_to_free->password); + kfree_sensitive(buf_to_free->password2); kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); kfree_sensitive(buf_to_free->auth_key.response); diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index c0c4933af5fc..86c647a947cc 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -367,6 +367,17 @@ again: } rc = cifs_setup_session(0, ses, server, nls_codepage); + if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { + /* + * Try alternate password for next reconnect (key rotation + * could be enabled on the server e.g.) if an alternate + * password is available and the current password is expired, + * but do not swap on non pwd related errors like host down + */ + if (ses->password2) + swap(ses->password2, ses->password); + } + if ((rc == -EACCES) && !tcon->retry) { mutex_unlock(&ses->session_mutex); rc = -EHOSTDOWN; From a8fa658eebe8b17fc852482da52f8841be8931d6 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 22 Mar 2024 14:26:04 +0800 Subject: [PATCH 169/270] eventfs: Fix kernel-doc comments to functions This commit fix kernel-doc style comments with complete parameter descriptions for the lookup_file(),lookup_dir_entry() and lookup_file_dentry(). Link: https://lore.kernel.org/linux-trace-kernel/20240322062604.28862-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index dc067eeb6387..894c6ca1e500 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -336,6 +336,7 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode, /** * lookup_file - look up a file in the tracefs filesystem + * @parent_ei: Pointer to the eventfs_inode that represents parent of the file * @dentry: the dentry to look up * @mode: the permission that the file should have. * @attr: saved attributes changed by user @@ -389,6 +390,7 @@ static struct dentry *lookup_file(struct eventfs_inode *parent_ei, /** * lookup_dir_entry - look up a dir in the tracefs filesystem * @dentry: the directory to look up + * @pei: Pointer to the parent eventfs_inode if available * @ei: the eventfs_inode that represents the directory to create * * This function will look up a dentry for a directory represented by @@ -478,16 +480,20 @@ void eventfs_d_release(struct dentry *dentry) /** * lookup_file_dentry - create a dentry for a file of an eventfs_inode + * @dentry: The parent dentry under which the new file's dentry will be created * @ei: the eventfs_inode that the file will be created under * @idx: the index into the entry_attrs[] of the @ei - * @parent: The parent dentry of the created file. - * @name: The name of the file to create * @mode: The mode of the file. * @data: The data to use to set the inode of the file with on open() * @fops: The fops of the file to be created. * - * Create a dentry for a file of an eventfs_inode @ei and place it into the - * address located at @e_dentry. + * This function creates a dentry for a file associated with an + * eventfs_inode @ei. It uses the entry attributes specified by @idx, + * if available. The file will have the specified @mode and its inode will be + * set up with @data upon open. The file operations will be set to @fops. + * + * Return: Returns a pointer to the newly created file's dentry or an error + * pointer. */ static struct dentry * lookup_file_dentry(struct dentry *dentry, From d96c36004e31e2baaf8ea1b449b7d0b2c2bfb41a Mon Sep 17 00:00:00 2001 From: Prasad Pandit Date: Fri, 22 Mar 2024 17:48:01 +0530 Subject: [PATCH 170/270] tracing: Fix FTRACE_RECORD_RECURSION_SIZE Kconfig entry Fix FTRACE_RECORD_RECURSION_SIZE entry, replace tab with a space character. It helps Kconfig parsers to read file without error. Link: https://lore.kernel.org/linux-trace-kernel/20240322121801.1803948-1-ppandit@redhat.com Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 773c16705058 ("ftrace: Add recording of functions that caused recursion") Signed-off-by: Prasad Pandit Reviewed-by: Randy Dunlap Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 61c541c36596..47345bf1d4a9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -965,7 +965,7 @@ config FTRACE_RECORD_RECURSION config FTRACE_RECORD_RECURSION_SIZE int "Max number of recursed functions to record" - default 128 + default 128 depends on FTRACE_RECORD_RECURSION help This defines the limit of number of functions that can be From 5281ec83454d70d98b71f1836fb16512566c01cd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 10:06:24 +0200 Subject: [PATCH 171/270] tracing: hide unused ftrace_event_id_fops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_PERF_EVENTS, a 'make W=1' build produces a warning about the unused ftrace_event_id_fops variable: kernel/trace/trace_events.c:2155:37: error: 'ftrace_event_id_fops' defined but not used [-Werror=unused-const-variable=] 2155 | static const struct file_operations ftrace_event_id_fops = { Hide this in the same #ifdef as the reference to it. Link: https://lore.kernel.org/linux-trace-kernel/20240403080702.3509288-7-arnd@kernel.org Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Mathieu Desnoyers Cc: Zheng Yejian Cc: Kees Cook Cc: Ajay Kaher Cc: Jinjie Ruan Cc: Clément Léger Cc: Dan Carpenter Cc: "Tzvetomir Stoyanov (VMware)" Fixes: 620a30e97feb ("tracing: Don't pass file_operations array to event_create_dir()") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7c364b87352e..52f75c36bbca 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1670,6 +1670,7 @@ static int trace_format_open(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_PERF_EVENTS static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -1684,6 +1685,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } +#endif static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, @@ -2152,10 +2154,12 @@ static const struct file_operations ftrace_event_format_fops = { .release = seq_release, }; +#ifdef CONFIG_PERF_EVENTS static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, .llseek = default_llseek, }; +#endif static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_file_tr, From ffe3986fece696cf65e0ef99e74c75f848be8e30 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 9 Apr 2024 15:13:09 -0400 Subject: [PATCH 172/270] ring-buffer: Only update pages_touched when a new page is touched The "buffer_percent" logic that is used by the ring buffer splice code to only wake up the tasks when there's no data after the buffer is filled to the percentage of the "buffer_percent" file is dependent on three variables that determine the amount of data that is in the ring buffer: 1) pages_read - incremented whenever a new sub-buffer is consumed 2) pages_lost - incremented every time a writer overwrites a sub-buffer 3) pages_touched - incremented when a write goes to a new sub-buffer The percentage is the calculation of: (pages_touched - (pages_lost + pages_read)) / nr_pages Basically, the amount of data is the total number of sub-bufs that have been touched, minus the number of sub-bufs lost and sub-bufs consumed. This is divided by the total count to give the buffer percentage. When the percentage is greater than the value in the "buffer_percent" file, it wakes up splice readers waiting for that amount. It was observed that over time, the amount read from the splice was constantly decreasing the longer the trace was running. That is, if one asked for 60%, it would read over 60% when it first starts tracing, but then it would be woken up at under 60% and would slowly decrease the amount of data read after being woken up, where the amount becomes much less than the buffer percent. This was due to an accounting of the pages_touched incrementation. This value is incremented whenever a writer transfers to a new sub-buffer. But the place where it was incremented was incorrect. If a writer overflowed the current sub-buffer it would go to the next one. If it gets preempted by an interrupt at that time, and the interrupt performs a trace, it too will end up going to the next sub-buffer. But only one should increment the counter. Unfortunately, that was not the case. Change the cmpxchg() that does the real switch of the tail-page into a try_cmpxchg(), and on success, perform the increment of pages_touched. This will only increment the counter once for when the writer moves to a new sub-buffer, and not when there's a race and is incremented for when a writer and its preempting writer both move to the same new sub-buffer. Link: https://lore.kernel.org/linux-trace-kernel/20240409151309.0d0e5056@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Fixes: 2c2b0a78b3739 ("ring-buffer: Add percentage of ring buffer full to wake up reader") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 25476ead681b..6511dc3a00da 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1393,7 +1393,6 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); - local_inc(&cpu_buffer->pages_touched); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. @@ -1430,8 +1429,9 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, */ local_set(&next_page->page->commit, 0); - /* Again, either we update tail_page or an interrupt does */ - (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); + /* Either we update tail_page or an interrupt does */ + if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page)) + local_inc(&cpu_buffer->pages_touched); } } From 3b0daecfeac0103aba8b293df07a0cbaf8b43f29 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 12 Apr 2024 06:11:25 +1000 Subject: [PATCH 173/270] amdkfd: use calloc instead of kzalloc to avoid integer overflow This uses calloc instead of doing the multiplication which might overflow. Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f9631f4b1a02..55aa74cbc532 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -779,8 +779,8 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp, * nodes, but not more than args->num_of_nodes as that is * the amount of memory allocated by user */ - pa = kzalloc((sizeof(struct kfd_process_device_apertures) * - args->num_of_nodes), GFP_KERNEL); + pa = kcalloc(args->num_of_nodes, sizeof(struct kfd_process_device_apertures), + GFP_KERNEL); if (!pa) return -ENOMEM; From 2b3e79fea66e166622a454715ce981432ac8c6e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 01:01:11 -0400 Subject: [PATCH 174/270] bcachefs: Don't use bch2_btree_node_lock_write_nofail() in btree split path It turns out - btree splits happen with the rest of the transaction still locked, to avoid unnecessary restarts, which means using nofail doesn't work here - we can deadlock. Fortunately, we now have the ability to return errors here. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 41 ++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c4a5e83a56a4..29a5bc7d8789 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1280,23 +1280,29 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) bch2_recalc_btree_reserve(c); } -static void bch2_btree_set_root(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +static int bch2_btree_set_root(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + bool nofail) { struct bch_fs *c = as->c; - struct btree *old; trace_and_count(c, btree_node_set_root, trans, b); - old = btree_node_root(c, b); + struct btree *old = btree_node_root(c, b); /* * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write_nofail(trans, path, &old->c); + if (nofail) { + bch2_btree_node_lock_write_nofail(trans, path, &old->c); + } else { + int ret = bch2_btree_node_lock_write(trans, path, &old->c); + if (ret) + return ret; + } bch2_btree_set_root_inmem(c, b); @@ -1310,6 +1316,7 @@ static void bch2_btree_set_root(struct btree_update *as, * depend on the new root would have to update the new root. */ bch2_btree_node_unlock_write(trans, path, old); + return 0; } /* Interior node updates: */ @@ -1652,15 +1659,16 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); - if (ret) - goto err; } else if (n3) { - bch2_btree_set_root(as, trans, trans->paths + path, n3); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, trans, trans->paths + path, n1); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false); } + if (ret) + goto err; + if (n3) { bch2_btree_update_get_open_buckets(as, n3); bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); @@ -1863,7 +1871,9 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * bch2_keylist_add(&as->parent_keys, &b->key); btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); - bch2_btree_set_root(as, trans, path, n); + int ret = bch2_btree_set_root(as, trans, path, n, true); + BUG_ON(ret); + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); bch2_trans_node_add(trans, path, n); @@ -2106,12 +2116,13 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); - if (ret) - goto err; } else { - bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n); + ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false); } + if (ret) + goto err; + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); From 84b1cec4fac5889b6d138d4b5ff6f2d5ef126c5e Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 4 Apr 2024 10:27:17 +0000 Subject: [PATCH 175/270] iommu/amd: Fix possible irq lock inversion dependency issue LOCKDEP detector reported below warning: ---------------------------------------- [ 23.796949] ======================================================== [ 23.796950] WARNING: possible irq lock inversion dependency detected [ 23.796952] 6.8.0fix+ #811 Not tainted [ 23.796954] -------------------------------------------------------- [ 23.796954] kworker/0:1/8 just changed the state of lock: [ 23.796956] ff365325e084a9b8 (&domain->lock){..-.}-{3:3}, at: amd_iommu_flush_iotlb_all+0x1f/0x50 [ 23.796969] but this lock took another, SOFTIRQ-unsafe lock in the past: [ 23.796970] (pd_bitmap_lock){+.+.}-{3:3} [ 23.796972] and interrupts could create inverse lock ordering between them. [ 23.796973] other info that might help us debug this: [ 23.796974] Chain exists of: &domain->lock --> &dev_data->lock --> pd_bitmap_lock [ 23.796980] Possible interrupt unsafe locking scenario: [ 23.796981] CPU0 CPU1 [ 23.796982] ---- ---- [ 23.796983] lock(pd_bitmap_lock); [ 23.796985] local_irq_disable(); [ 23.796985] lock(&domain->lock); [ 23.796988] lock(&dev_data->lock); [ 23.796990] [ 23.796991] lock(&domain->lock); Fix this issue by disabling interrupt when acquiring pd_bitmap_lock. Note that this is temporary fix. We have a plan to replace custom bitmap allocator with IDA allocator. Fixes: 87a6f1f22c97 ("iommu/amd: Introduce per-device domain ID to fix potential TLB aliasing issue") Reviewed-by: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20240404102717.6705-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d35c1b8c8e65..e692217fcb28 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1692,26 +1692,29 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, static u16 domain_id_alloc(void) { + unsigned long flags; int id; - spin_lock(&pd_bitmap_lock); + spin_lock_irqsave(&pd_bitmap_lock, flags); id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); BUG_ON(id == 0); if (id > 0 && id < MAX_DOMAIN_ID) __set_bit(id, amd_iommu_pd_alloc_bitmap); else id = 0; - spin_unlock(&pd_bitmap_lock); + spin_unlock_irqrestore(&pd_bitmap_lock, flags); return id; } static void domain_id_free(int id) { - spin_lock(&pd_bitmap_lock); + unsigned long flags; + + spin_lock_irqsave(&pd_bitmap_lock, flags); if (id > 0 && id < MAX_DOMAIN_ID) __clear_bit(id, amd_iommu_pd_alloc_bitmap); - spin_unlock(&pd_bitmap_lock); + spin_unlock_irqrestore(&pd_bitmap_lock, flags); } static void free_gcr3_tbl_level1(u64 *tbl) From b650b38b006053ef96e90b8e3f7e6edc7845cd57 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 10 Apr 2024 08:57:02 +0000 Subject: [PATCH 176/270] iommu/amd: Do not enable SNP when V2 page table is enabled DTE[Mode]=0 is not supported when SNP is enabled in the host. That means to support SNP, IOMMU must be configured with V1 page table (See IOMMU spec [1] for the details). If user passes kernel command line to configure IOMMU domains with v2 page table (amd_iommu=pgtbl_v2) then disable SNP as the user asked by not forcing the page table to v1. [1] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_IOMMU.pdf Cc: Ashish Kalra Cc: Michael Roth Cc: Tom Lendacky Signed-off-by: Vasant Hegde Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240410085702.31869-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 33228c1c8980..a714eb08c08b 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3232,28 +3232,29 @@ static void iommu_snp_enable(void) return; /* * The SNP support requires that IOMMU must be enabled, and is - * not configured in the passthrough mode. + * configured with V1 page table (DTE[Mode] = 0 is not supported). */ if (no_iommu || iommu_default_passthrough()) { pr_err("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); - cc_platform_clear(CC_ATTR_HOST_SEV_SNP); - return; + goto disable_snp; + } + + if (amd_iommu_pgtable != AMD_IOMMU_V1) { + pr_warn("SNP: IOMMU is configured with V2 page table mode, SNP cannot be supported.\n"); + goto disable_snp; } amd_iommu_snp_en = check_feature(FEATURE_SNP); if (!amd_iommu_snp_en) { pr_err("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); - cc_platform_clear(CC_ATTR_HOST_SEV_SNP); - return; + goto disable_snp; } pr_info("IOMMU SNP support enabled.\n"); + return; - /* Enforce IOMMU v1 pagetable when SNP is enabled. */ - if (amd_iommu_pgtable != AMD_IOMMU_V1) { - pr_warn("Forcing use of AMD IOMMU v1 page table due to SNP.\n"); - amd_iommu_pgtable = AMD_IOMMU_V1; - } +disable_snp: + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); #endif } From 7537e31df80cb58c27f3b6fef702534ea87a5957 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 10 Apr 2024 18:41:09 +0200 Subject: [PATCH 177/270] iommu: mtk: fix module autoloading Add MODULE_DEVICE_TABLE(), so modules could be properly autoloaded based on the alias from of_device_id table. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240410164109.233308-1-krzk@kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c | 1 + drivers/iommu/mtk_iommu_v1.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index b8c47f18bc26..6a2707fe7a78 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -1790,6 +1790,7 @@ static const struct of_device_id mtk_iommu_of_ids[] = { { .compatible = "mediatek,mt8365-m4u", .data = &mt8365_data}, {} }; +MODULE_DEVICE_TABLE(of, mtk_iommu_of_ids); static struct platform_driver mtk_iommu_driver = { .probe = mtk_iommu_probe, diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index a9fa2a54dc9b..d6e4002200bd 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -600,6 +600,7 @@ static const struct of_device_id mtk_iommu_v1_of_ids[] = { { .compatible = "mediatek,mt2701-m4u", }, {} }; +MODULE_DEVICE_TABLE(of, mtk_iommu_v1_of_ids); static const struct component_master_ops mtk_iommu_v1_com_ops = { .bind = mtk_iommu_v1_bind, From 36d4fe147c870f6d3f6602befd7ef44393a1c87a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:50 -0700 Subject: [PATCH 178/270] x86/bugs: Remove CONFIG_BHI_MITIGATION_AUTO and spectre_bhi=auto Unlike most other mitigations' "auto" options, spectre_bhi=auto only mitigates newer systems, which is confusing and not particularly useful. Remove it. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Cc: Sean Christopherson Cc: Linus Torvalds Link: https://lore.kernel.org/r/412e9dc87971b622bbbaf64740ebc1f140bff343.1712813475.git.jpoimboe@kernel.org --- Documentation/admin-guide/hw-vuln/spectre.rst | 4 ---- Documentation/admin-guide/kernel-parameters.txt | 3 --- arch/x86/Kconfig | 5 ----- arch/x86/kernel/cpu/bugs.c | 10 +--------- 4 files changed, 1 insertion(+), 21 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 5a39acf82483..25a04cda4c2c 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -669,10 +669,6 @@ kernel command line. needed. off Disable the mitigation. - auto - Enable the HW mitigation if needed, but - *don't* enable the SW mitigation except for KVM. - The system may be vulnerable. For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a3874cc97892..902ecd92a29f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6072,9 +6072,6 @@ on - (default) Enable the HW or SW mitigation as needed. off - Disable the mitigation. - auto - Enable the HW mitigation if needed, but - *don't* enable the SW mitigation except - for KVM. The system may be vulnerable. spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 10a6251f58f3..b63b6767a63d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2651,11 +2651,6 @@ config SPECTRE_BHI_OFF bool "off" help Equivalent to setting spectre_bhi=off command line parameter. -config SPECTRE_BHI_AUTO - bool "auto" - depends on BROKEN - help - Equivalent to setting spectre_bhi=auto command line parameter. endchoice diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9eeb60f5fbb3..6af4780a18ed 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1625,13 +1625,10 @@ static bool __init spec_ctrl_bhi_dis(void) enum bhi_mitigations { BHI_MITIGATION_OFF, BHI_MITIGATION_ON, - BHI_MITIGATION_AUTO, }; static enum bhi_mitigations bhi_mitigation __ro_after_init = - IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : - IS_ENABLED(CONFIG_SPECTRE_BHI_OFF) ? BHI_MITIGATION_OFF : - BHI_MITIGATION_AUTO; + IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { @@ -1642,8 +1639,6 @@ static int __init spectre_bhi_parse_cmdline(char *str) bhi_mitigation = BHI_MITIGATION_OFF; else if (!strcmp(str, "on")) bhi_mitigation = BHI_MITIGATION_ON; - else if (!strcmp(str, "auto")) - bhi_mitigation = BHI_MITIGATION_AUTO; else pr_err("Ignoring unknown spectre_bhi option (%s)", str); @@ -1673,9 +1668,6 @@ static void __init bhi_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n"); - if (bhi_mitigation == BHI_MITIGATION_AUTO) - return; - /* Mitigate syscalls when the mitigation is forced =on */ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n"); From 4f511739c54b549061993b53fc0380f48dfca23b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:51 -0700 Subject: [PATCH 179/270] x86/bugs: Replace CONFIG_SPECTRE_BHI_{ON,OFF} with CONFIG_MITIGATION_SPECTRE_BHI For consistency with the other CONFIG_MITIGATION_* options, replace the CONFIG_SPECTRE_BHI_{ON,OFF} options with a single CONFIG_MITIGATION_SPECTRE_BHI option. [ mingo: Fix ] Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Sean Christopherson Cc: Linus Torvalds Cc: Nikolay Borisov Link: https://lore.kernel.org/r/3833812ea63e7fdbe36bf8b932e63f70d18e2a2a.1712813475.git.jpoimboe@kernel.org --- arch/x86/Kconfig | 17 +++-------------- arch/x86/kernel/cpu/bugs.c | 2 +- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b63b6767a63d..4474bf32d0a4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2633,27 +2633,16 @@ config MITIGATION_RFDS stored in floating point, vector and integer registers. See also -choice - prompt "Clear branch history" +config MITIGATION_SPECTRE_BHI + bool "Mitigate Spectre-BHB (Branch History Injection)" depends on CPU_SUP_INTEL - default SPECTRE_BHI_ON + default y help Enable BHI mitigations. BHI attacks are a form of Spectre V2 attacks where the branch history buffer is poisoned to speculatively steer indirect branches. See -config SPECTRE_BHI_ON - bool "on" - help - Equivalent to setting spectre_bhi=on command line parameter. -config SPECTRE_BHI_OFF - bool "off" - help - Equivalent to setting spectre_bhi=off command line parameter. - -endchoice - endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6af4780a18ed..ca295b0c1eee 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1628,7 +1628,7 @@ enum bhi_mitigations { }; static enum bhi_mitigations bhi_mitigation __ro_after_init = - IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { From 1b3108f6898ef2e03973d65255182792e94e2240 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2024 21:45:27 +0200 Subject: [PATCH 180/270] x86/cpu/amd: Make the CPUID 0x80000008 parser correct CPUID 0x80000008 ECX.cpu_nthreads describes the number of threads in the package. The parser uses this value to initialize the SMT domain level. That's wrong because cpu_nthreads does not describe the number of threads per physical core. So this needs to set the CORE domain level and let the later parsers set the SMT shift if available. Preset the SMT domain level with the assumption of one thread per core, which is correct ifrt here are no other CPUID leafs to parse, and propagate cpu_nthreads and the core level APIC bitwidth into the CORE domain. Fixes: f7fb3b2dd92c ("x86/cpu: Provide an AMD/HYGON specific topology parser") Reported-by: "kernelci.org bot" Reported-by: Laura Nao Signed-off-by: Thomas Gleixner Tested-by: Laura Nao Link: https://lore.kernel.org/r/20240410194311.535206450@linutronix.de --- arch/x86/kernel/cpu/topology_amd.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 1a8b3ad493af..79a85a4814ca 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -29,11 +29,21 @@ static bool parse_8000_0008(struct topo_scan *tscan) if (!sft) sft = get_count_order(ecx.cpu_nthreads + 1); - topology_set_dom(tscan, TOPO_SMT_DOMAIN, sft, ecx.cpu_nthreads + 1); + /* + * cpu_nthreads describes the number of threads in the package + * sft is the number of APIC ID bits per package + * + * As the number of actual threads per core is not described in + * this leaf, just set the CORE domain shift and let the later + * parsers set SMT shift. Assume one thread per core by default + * which is correct if there are no other CPUID leafs to parse. + */ + topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1); return true; } -static void store_node(struct topo_scan *tscan, unsigned int nr_nodes, u16 node_id) +static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) { /* * Starting with Fam 17h the DIE domain could probably be used to @@ -73,12 +83,14 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) tscan->c->topo.initial_apicid = leaf.ext_apic_id; /* - * If leaf 0xb is available, then SMT shift is set already. If not - * take it from ecx.threads_per_core and use topo_update_dom() - - * topology_set_dom() would propagate and overwrite the already - * propagated CORE level. + * If leaf 0xb is available, then the domain shifts are set + * already and nothing to do here. */ if (!has_0xb) { + /* + * Leaf 0x80000008 set the CORE domain shift already. + * Update the SMT domain, but do not propagate it. + */ unsigned int nthreads = leaf.core_nthreads + 1; topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads); From c064b536a8f9ab7c8e204da8f5a22f7420d0b56c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2024 21:45:28 +0200 Subject: [PATCH 181/270] x86/cpu/amd: Make the NODEID_MSR union actually work A system with NODEID_MSR was reported to crash during early boot without any output. The reason is that the union which is used for accessing the bitfields in the MSR is written wrongly and the resulting executable code accesses the wrong part of the MSR data. As a consequence a later division by that value results in 0 and that result is used for another division as divisor, which obviously does not work well. The magic world of C, unions and bitfields: union { u64 bita : 3, bitb : 3; u64 all; } x; x.all = foo(); a = x.bita; b = x.bitb; results in the effective executable code of: a = b = x.bita; because bita and bitb are treated as union members and therefore both end up at bit offset 0. Wrapping the bitfield into an anonymous struct: union { struct { u64 bita : 3, bitb : 3; }; u64 all; } x; works like expected. Rework the NODEID_MSR union in exactly that way to cure the problem. Fixes: f7fb3b2dd92c ("x86/cpu: Provide an AMD/HYGON specific topology parser") Reported-by: "kernelci.org bot" Reported-by: Laura Nao Signed-off-by: Thomas Gleixner Tested-by: Laura Nao Link: https://lore.kernel.org/r/20240410194311.596282919@linutronix.de Closes: https://lore.kernel.org/all/20240322175210.124416-1-laura.nao@collabora.com/ --- arch/x86/kernel/cpu/topology_amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 79a85a4814ca..7f999aef1965 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -121,13 +121,13 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) static bool parse_fam10h_node_id(struct topo_scan *tscan) { - struct { - union { + union { + struct { u64 node_id : 3, nodes_per_pkg : 3, unused : 58; - u64 msr; }; + u64 msr; } nid; if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) From 7211274fe0ee352332255e41ab5e628b86e83994 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Apr 2024 18:55:38 +0200 Subject: [PATCH 182/270] x86/cpu/amd: Move TOPOEXT enablement into the topology parser The topology rework missed that early_init_amd() tries to re-enable the Topology Extensions when the BIOS disabled them. The new parser is invoked before early_init_amd() so the re-enable attempt happens too late. Move it into the AMD specific topology parser code where it belongs. Fixes: f7fb3b2dd92c ("x86/cpu: Provide an AMD/HYGON specific topology parser") Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/878r1j260l.ffs@tglx --- arch/x86/kernel/cpu/amd.c | 15 --------------- arch/x86/kernel/cpu/topology_amd.c | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9bf17c9c29da..cb9eece55904 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -535,7 +535,6 @@ clear_sev: static void early_init_amd(struct cpuinfo_x86 *c) { - u64 value; u32 dummy; if (c->x86 >= 0xf) @@ -603,20 +602,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_detect_mem_encrypt(c); - /* Re-enable TopologyExtensions if switched off by BIOS */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && - !cpu_has(c, X86_FEATURE_TOPOEXT)) { - - if (msr_set_bit(0xc0011005, 54) > 0) { - rdmsrl(0xc0011005, value); - if (value & BIT_64(54)) { - set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); - } - } - } - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 7f999aef1965..a7aa6eff4ae5 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -147,6 +147,26 @@ static void legacy_set_llc(struct topo_scan *tscan) tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN]; } +static void topoext_fixup(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + u64 msrval; + + /* Try to re-enable TopologyExtensions if switched off by BIOS */ + if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD || + c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) + return; + + if (msr_set_bit(0xc0011005, 54) <= 0) + return; + + rdmsrl(0xc0011005, msrval); + if (msrval & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } +} + static void parse_topology_amd(struct topo_scan *tscan) { bool has_0xb = false; @@ -176,6 +196,7 @@ static void parse_topology_amd(struct topo_scan *tscan) void cpu_parse_topology_amd(struct topo_scan *tscan) { tscan->amd_nodes_per_pkg = 1; + topoext_fixup(tscan); parse_topology_amd(tscan); if (tscan->amd_nodes_per_pkg > 1) From 5b3625a4f6422e8982f90f0c11b5546149c962b8 Mon Sep 17 00:00:00 2001 From: Xuchun Shang Date: Thu, 11 Apr 2024 11:07:42 +0800 Subject: [PATCH 183/270] iommu/vt-d: Fix wrong use of pasid config The commit "iommu/vt-d: Add IOMMU perfmon support" introduce IOMMU PMU feature, but use the wrong config when set pasid filter. Fixes: 7232ab8b89e9 ("iommu/vt-d: Add IOMMU perfmon support") Signed-off-by: Xuchun Shang Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20240401060753.3321318-1-xuchun.shang@linux.alibaba.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/perfmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c index cf43e798eca4..44083d01852d 100644 --- a/drivers/iommu/intel/perfmon.c +++ b/drivers/iommu/intel/perfmon.c @@ -438,7 +438,7 @@ static int iommu_pmu_assign_event(struct iommu_pmu *iommu_pmu, iommu_pmu_set_filter(domain, event->attr.config1, IOMMU_PMU_FILTER_DOMAIN, idx, event->attr.config1); - iommu_pmu_set_filter(pasid, event->attr.config1, + iommu_pmu_set_filter(pasid, event->attr.config2, IOMMU_PMU_FILTER_PASID, idx, event->attr.config1); iommu_pmu_set_filter(ats, event->attr.config2, From a34f3e20ddff02c4f12df2c0635367394e64c63d Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 11 Apr 2024 11:07:43 +0800 Subject: [PATCH 184/270] iommu/vt-d: Allocate local memory for page request queue The page request queue is per IOMMU, its allocation should be made NUMA-aware for performance reasons. Fixes: a222a7f0bb6c ("iommu/vt-d: Implement page request handling") Signed-off-by: Jacob Pan Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240403214007.985600-1-jacob.jun.pan@linux.intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index c1bed89b1026..ee3b469e2da1 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -66,7 +66,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) struct page *pages; int irq, ret; - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER); + pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO, PRQ_ORDER); if (!pages) { pr_warn("IOMMU: %s: Failed to allocate page request queue\n", iommu->name); From 89436f4f54125b1297aec1f466efd8acb4ec613d Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 11 Apr 2024 11:07:44 +0800 Subject: [PATCH 185/270] iommu/vt-d: Fix WARN_ON in iommu probe path Commit 1a75cc710b95 ("iommu/vt-d: Use rbtree to track iommu probed devices") adds all devices probed by the iommu driver in a rbtree indexed by the source ID of each device. It assumes that each device has a unique source ID. This assumption is incorrect and the VT-d spec doesn't state this requirement either. The reason for using a rbtree to track devices is to look up the device with PCI bus and devfunc in the paths of handling ATS invalidation time out error and the PRI I/O page faults. Both are PCI ATS feature related. Only track the devices that have PCI ATS capabilities in the rbtree to avoid unnecessary WARN_ON in the iommu probe path. Otherwise, on some platforms below kernel splat will be displayed and the iommu probe results in failure. WARNING: CPU: 3 PID: 166 at drivers/iommu/intel/iommu.c:158 intel_iommu_probe_device+0x319/0xd90 Call Trace: ? __warn+0x7e/0x180 ? intel_iommu_probe_device+0x319/0xd90 ? report_bug+0x1f8/0x200 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x18/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? intel_iommu_probe_device+0x319/0xd90 ? debug_mutex_init+0x37/0x50 __iommu_probe_device+0xf2/0x4f0 iommu_probe_device+0x22/0x70 iommu_bus_notifier+0x1e/0x40 notifier_call_chain+0x46/0x150 blocking_notifier_call_chain+0x42/0x60 bus_notify+0x2f/0x50 device_add+0x5ed/0x7e0 platform_device_add+0xf5/0x240 mfd_add_devices+0x3f9/0x500 ? preempt_count_add+0x4c/0xa0 ? up_write+0xa2/0x1b0 ? __debugfs_create_file+0xe3/0x150 intel_lpss_probe+0x49f/0x5b0 ? pci_conf1_write+0xa3/0xf0 intel_lpss_pci_probe+0xcf/0x110 [intel_lpss_pci] pci_device_probe+0x95/0x120 really_probe+0xd9/0x370 ? __pfx___driver_attach+0x10/0x10 __driver_probe_device+0x73/0x150 driver_probe_device+0x19/0xa0 __driver_attach+0xb6/0x180 ? __pfx___driver_attach+0x10/0x10 bus_for_each_dev+0x77/0xd0 bus_add_driver+0x114/0x210 driver_register+0x5b/0x110 ? __pfx_intel_lpss_pci_driver_init+0x10/0x10 [intel_lpss_pci] do_one_initcall+0x57/0x2b0 ? kmalloc_trace+0x21e/0x280 ? do_init_module+0x1e/0x210 do_init_module+0x5f/0x210 load_module+0x1d37/0x1fc0 ? init_module_from_file+0x86/0xd0 init_module_from_file+0x86/0xd0 idempotent_init_module+0x17c/0x230 __x64_sys_finit_module+0x56/0xb0 do_syscall_64+0x6e/0x140 entry_SYSCALL_64_after_hwframe+0x71/0x79 Fixes: 1a75cc710b95 ("iommu/vt-d: Use rbtree to track iommu probed devices") Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10689 Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20240407011429.136282-1-baolu.lu@linux.intel.com Reviewed-by: Kevin Tian Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 50eb9aed47cc..a7ecd90303dc 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4299,9 +4299,11 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) } dev_iommu_priv_set(dev, info); - ret = device_rbtree_insert(iommu, info); - if (ret) - goto free; + if (pdev && pci_ats_supported(pdev)) { + ret = device_rbtree_insert(iommu, info); + if (ret) + goto free; + } if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { ret = intel_pasid_alloc_table(dev); @@ -4336,7 +4338,8 @@ static void intel_iommu_release_device(struct device *dev) struct intel_iommu *iommu = info->iommu; mutex_lock(&iommu->iopf_lock); - device_rbtree_remove(info); + if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) + device_rbtree_remove(info); mutex_unlock(&iommu->iopf_lock); if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && From b8246a2ad80a810cafbeddb30525278f9d64bca3 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 10 Apr 2024 10:16:43 +0000 Subject: [PATCH 186/270] iommu/amd: Change log message severity Use consistent log severity (pr_warn) to log all messages in SNP enable path. Suggested-by: Tom Lendacky Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20240410101643.32309-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index a714eb08c08b..ac6754a85f35 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3235,7 +3235,7 @@ static void iommu_snp_enable(void) * configured with V1 page table (DTE[Mode] = 0 is not supported). */ if (no_iommu || iommu_default_passthrough()) { - pr_err("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); + pr_warn("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); goto disable_snp; } @@ -3246,7 +3246,7 @@ static void iommu_snp_enable(void) amd_iommu_snp_en = check_feature(FEATURE_SNP); if (!amd_iommu_snp_en) { - pr_err("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); + pr_warn("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); goto disable_snp; } From e4a6bceac98eba3c00e874892736b34ea5fdaca3 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Apr 2024 16:26:28 -0700 Subject: [PATCH 187/270] selftests: timers: Fix posix_timers ksft_print_msg() warning After commit 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") the following warning occurs when building with an older gcc: posix_timers.c:250:2: warning: format not a string literal and no format arguments [-Wformat-security] 250 | ksft_print_msg(errmsg); | ^~~~~~~~~~~~~~ Fix this up by changing it to ksft_print_msg("%s", errmsg) Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Acked-by: Justin Stitt Acked-by: Shuah Khan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240410232637.4135564-1-jstultz@google.com --- tools/testing/selftests/timers/posix_timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index d86a0e00711e..348f47176e0a 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -247,7 +247,7 @@ static int check_timer_distribution(void) ksft_test_result_skip("check signal distribution (old kernel)\n"); return 0; err: - ksft_print_msg(errmsg); + ksft_print_msg("%s", errmsg); return -1; } From f7d5bcd35d427daac7e206b1073ca14f5db85c27 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 11 Apr 2024 11:45:40 -0700 Subject: [PATCH 188/270] selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn After commit 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()"), clang warns: tools/testing/selftests/timers/../kselftest.h:398:6: warning: variable 'major' is used uninitialized whenever '||' condition is true [-Wsometimes-uninitialized] 398 | if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) | ^~~~~~~~~~~~ tools/testing/selftests/timers/../kselftest.h:401:9: note: uninitialized use occurs here 401 | return major > min_major || (major == min_major && minor >= min_minor); | ^~~~~ tools/testing/selftests/timers/../kselftest.h:398:6: note: remove the '||' if its condition is always false 398 | if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) | ^~~~~~~~~~~~~~~ tools/testing/selftests/timers/../kselftest.h:395:20: note: initialize the variable 'major' to silence this warning 395 | unsigned int major, minor; | ^ | = 0 This is a false positive because if uname() fails, ksft_exit_fail_msg() will be called, which unconditionally calls exit(), a noreturn function. However, clang does not know that ksft_exit_fail_msg() will call exit() at the point in the pipeline that the warning is emitted because inlining has not occurred, so it assumes control flow will resume normally after ksft_exit_fail_msg() is called. Make it clear to clang that all of the functions that call exit() unconditionally in kselftest.h are noreturn transitively by marking them explicitly with '__attribute__((__noreturn__))', which clears up the warning above and any future warnings that may appear for the same reason. Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Reported-by: John Stultz Signed-off-by: Nathan Chancellor Signed-off-by: Thomas Gleixner Acked-by: Shuah Khan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240411-mark-kselftest-exit-funcs-noreturn-v1-1-b027c948f586@kernel.org Closes: https://lore.kernel.org/all/20240410232637.4135564-2-jstultz@google.com/ --- tools/testing/selftests/kselftest.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 973b18e156b2..0591974b57e0 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -80,6 +80,9 @@ #define KSFT_XPASS 3 #define KSFT_SKIP 4 +#ifndef __noreturn +#define __noreturn __attribute__((__noreturn__)) +#endif #define __printf(a, b) __attribute__((format(printf, a, b))) /* counters */ @@ -300,13 +303,13 @@ void ksft_test_result_code(int exit_code, const char *test_name, va_end(args); } -static inline int ksft_exit_pass(void) +static inline __noreturn int ksft_exit_pass(void) { ksft_print_cnts(); exit(KSFT_PASS); } -static inline int ksft_exit_fail(void) +static inline __noreturn int ksft_exit_fail(void) { ksft_print_cnts(); exit(KSFT_FAIL); @@ -333,7 +336,7 @@ static inline int ksft_exit_fail(void) ksft_cnt.ksft_xfail + \ ksft_cnt.ksft_xskip) -static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) +static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) { int saved_errno = errno; va_list args; @@ -348,19 +351,19 @@ static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) exit(KSFT_FAIL); } -static inline int ksft_exit_xfail(void) +static inline __noreturn int ksft_exit_xfail(void) { ksft_print_cnts(); exit(KSFT_XFAIL); } -static inline int ksft_exit_xpass(void) +static inline __noreturn int ksft_exit_xpass(void) { ksft_print_cnts(); exit(KSFT_XPASS); } -static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...) +static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) { int saved_errno = errno; va_list args; From ed366de8ec89d4f960d66c85fc37d9de22f7bf6d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Apr 2024 16:26:30 -0700 Subject: [PATCH 189/270] selftests: timers: Fix abs() warning in posix_timers test Building with clang results in the following warning: posix_timers.c:69:6: warning: absolute value function 'abs' given an argument of type 'long long' but has parameter of type 'int' which may cause truncation of value [-Wabsolute-value] if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { ^ So switch to using llabs() instead. Fixes: 0bc4b0cf1570 ("selftests: add basic posix timers selftests") Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240410232637.4135564-3-jstultz@google.com --- tools/testing/selftests/timers/posix_timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 348f47176e0a..c001dd79179d 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -66,7 +66,7 @@ static int check_diff(struct timeval start, struct timeval end) diff = end.tv_usec - start.tv_usec; diff += (end.tv_sec - start.tv_sec) * USECS_PER_SEC; - if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { + if (llabs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { printf("Diff too high: %lld..", diff); return -1; } From 3ec4848913d695245716ea45ca4872d9dff097a5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 11 Apr 2024 11:23:48 +0800 Subject: [PATCH 190/270] block: fix that blk_time_get_ns() doesn't update time after schedule While monitoring the throttle time of IO from iocost, it's found that such time is always zero after the io_schedule() from ioc_rqos_throttle, for example, with the following debug patch: + printk("%s-%d: %s enter %llu\n", current->comm, current->pid, __func__, blk_time_get_ns()); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); if (wait.committed) break; io_schedule(); } + printk("%s-%d: %s exit %llu\n", current->comm, current->pid, __func__, blk_time_get_ns()); It can be observerd that blk_time_get_ns() always return the same time: [ 1068.096579] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.272587] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.274389] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.472690] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.474485] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.672656] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.674451] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.872655] fio-1268: ioc_rqos_throttle exit 1067901962288 And I think the root cause is that 'PF_BLOCK_TS' is always cleared by blk_flush_plug() before scheduel(), hence blk_plug_invalidate_ts() will never be called: blk_time_get_ns plug->cur_ktime = ktime_get_ns(); current->flags |= PF_BLOCK_TS; io_schedule: io_schedule_prepare blk_flush_plug __blk_flush_plug /* the flag is cleared, while time is not */ current->flags &= ~PF_BLOCK_TS; schedule sched_update_worker /* the flag is not set, hence plug->cur_ktime is not cleared */ if (tsk->flags & PF_BLOCK_TS) blk_plug_invalidate_ts() blk_time_get_ns /* got the time stashed before schedule */ return plug->cur_ktime; Fix the problem by clearing cached time in __blk_flush_plug(). Fixes: 06b23f92af87 ("block: update cached timestamp post schedule/preemption") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240411032349.3051233-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-core.c b/block/blk-core.c index 3a6f5603fb44..b795ac177281 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1197,6 +1197,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); + plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; } From 16767502aa990cca2cb7d1372b31d328c4c85b40 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 12 Apr 2024 14:35:36 +0200 Subject: [PATCH 191/270] selftests: kselftest: Fix build failure with NOLIBC As Mark explains ksft_min_kernel_version() can't be compiled with nolibc, it doesn't implement uname(). Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Reported-by: Mark Brown Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240412123536.GA32444@redhat.com Closes: https://lore.kernel.org/all/f0523b3a-ea08-4615-b0fb-5b504a2d39df@sirena.org.uk/ --- tools/testing/selftests/kselftest.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 0591974b57e0..7b89362da4bf 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -395,6 +395,10 @@ static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) static inline int ksft_min_kernel_version(unsigned int min_major, unsigned int min_minor) { +#ifdef NOLIBC + ksft_print_msg("NOLIBC: Can't check kernel version: Function not implemented\n"); + return 0; +#else unsigned int major, minor; struct utsname info; @@ -402,6 +406,7 @@ static inline int ksft_min_kernel_version(unsigned int min_major, ksft_exit_fail_msg("Can't parse kernel version\n"); return major > min_major || (major == min_major && minor >= min_minor); +#endif } #endif /* __KSELFTEST_H */ From d5cf50dafc9dd5faa1e61e7021e3496ddf7fd61e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 12 Apr 2024 10:05:10 -0700 Subject: [PATCH 192/270] Kconfig: add some hidden tabs on purpose Commit d96c36004e31 ("tracing: Fix FTRACE_RECORD_RECURSION_SIZE Kconfig entry") removed a hidden tab because it apparently showed breakage in some third-party kernel config parsing tool. It wasn't clear what tool it was, but let's make sure it gets fixed. Because if you can't parse tabs as whitespace, you should not be parsing the kernel Kconfig files. In fact, let's make such breakage more obvious than some esoteric ftrace record size option. If you can't parse tabs, you can't have page sizes. Yes, tab-vs-space confusion is sadly a traditional Unix thing, and 'make' is famous for being broken in this regard. But no, that does not mean that it's ok. I'd add more random tabs to our Kconfig files, but I don't want to make things uglier than necessary. But it *might* bbe necessary if it turns out we see more of this kind of silly tooling. Fixes: d96c36004e31 ("tracing: Fix FTRACE_RECORD_RECURSION_SIZE Kconfig entry") Link: https://lore.kernel.org/lkml/CAHk-=wj-hLLN_t_m5OL4dXLaxvXKy_axuoJYXif7iczbfgAevQ@mail.gmail.com/ Signed-off-by: Linus Torvalds --- arch/Kconfig | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 9f066785bb71..65afb1de48b3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1172,12 +1172,12 @@ config PAGE_SIZE_LESS_THAN_256KB config PAGE_SHIFT int - default 12 if PAGE_SIZE_4KB - default 13 if PAGE_SIZE_8KB - default 14 if PAGE_SIZE_16KB - default 15 if PAGE_SIZE_32KB - default 16 if PAGE_SIZE_64KB - default 18 if PAGE_SIZE_256KB + default 12 if PAGE_SIZE_4KB + default 13 if PAGE_SIZE_8KB + default 14 if PAGE_SIZE_16KB + default 15 if PAGE_SIZE_32KB + default 16 if PAGE_SIZE_64KB + default 18 if PAGE_SIZE_256KB # This allows to use a set of generic functions to determine mmap base # address by giving priority to top-down scheme only if the process From 11baa36d317321f5d54059f07d243c5a1dbbfbb2 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 10 Apr 2024 19:03:05 +0200 Subject: [PATCH 193/270] gpio: lpc32xx: fix module autoloading Add MODULE_DEVICE_TABLE(), so the module could be properly autoloaded based on the alias from of_device_id table. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-lpc32xx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-lpc32xx.c b/drivers/gpio/gpio-lpc32xx.c index 5ef8af824980..c097e310c9e8 100644 --- a/drivers/gpio/gpio-lpc32xx.c +++ b/drivers/gpio/gpio-lpc32xx.c @@ -529,6 +529,7 @@ static const struct of_device_id lpc32xx_gpio_of_match[] = { { .compatible = "nxp,lpc3220-gpio", }, { }, }; +MODULE_DEVICE_TABLE(of, lpc32xx_gpio_of_match); static struct platform_driver lpc32xx_gpio_driver = { .driver = { From 79336504781e7fee5ddaf046dcc186c8dfdf60b1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 12 Apr 2024 08:41:15 +0900 Subject: [PATCH 194/270] ata: libata-scsi: Fix ata_scsi_dev_rescan() error path Commit 0c76106cb975 ("scsi: sd: Fix TCG OPAL unlock on system resume") incorrectly handles failures of scsi_resume_device() in ata_scsi_dev_rescan(), leading to a double call to spin_unlock_irqrestore() to unlock a device port. Fix this by redefining the goto labels used in case of errors and only unlock the port scsi_scan_mutex when scsi_resume_device() fails. Bug found with the Smatch static checker warning: drivers/ata/libata-scsi.c:4774 ata_scsi_dev_rescan() error: double unlocked 'ap->lock' (orig line 4757) Reported-by: Dan Carpenter Fixes: 0c76106cb975 ("scsi: sd: Fix TCG OPAL unlock on system resume") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel --- drivers/ata/libata-scsi.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 2f4c58837641..e954976891a9 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -4745,7 +4745,7 @@ void ata_scsi_dev_rescan(struct work_struct *work) * bail out. */ if (ap->pflags & ATA_PFLAG_SUSPENDED) - goto unlock; + goto unlock_ap; if (!sdev) continue; @@ -4758,7 +4758,7 @@ void ata_scsi_dev_rescan(struct work_struct *work) if (do_resume) { ret = scsi_resume_device(sdev); if (ret == -EWOULDBLOCK) - goto unlock; + goto unlock_scan; dev->flags &= ~ATA_DFLAG_RESUMING; } ret = scsi_rescan_device(sdev); @@ -4766,12 +4766,13 @@ void ata_scsi_dev_rescan(struct work_struct *work) spin_lock_irqsave(ap->lock, flags); if (ret) - goto unlock; + goto unlock_ap; } } -unlock: +unlock_ap: spin_unlock_irqrestore(ap->lock, flags); +unlock_scan: mutex_unlock(&ap->scsi_scan_mutex); /* Reschedule with a delay if scsi_rescan_device() returned an error */ From c0297e7dd50795d559f3534887a6de1756b35d0f Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Thu, 11 Apr 2024 20:12:24 +0000 Subject: [PATCH 195/270] ata: libata-core: Allow command duration limits detection for ACS-4 drives Even though the command duration limits (CDL) feature was first added in ACS-5 (major version 12), there are some ACS-4 (major version 11) drives that implement CDL as well. IDENTIFY_DEVICE, SUPPORTED_CAPABILITIES, and CURRENT_SETTINGS log pages are mandatory in the ACS-4 standard so it should be safe to read these log pages on older drives implementing the ACS-4 standard. Fixes: 62e4a60e0cdb ("scsi: ata: libata: Detect support for command duration limits") Cc: stable@vger.kernel.org Signed-off-by: Igor Pylypiv Signed-off-by: Damien Le Moal --- drivers/ata/libata-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index be3412cdb22e..c449d60d9bb9 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2539,7 +2539,7 @@ static void ata_dev_config_cdl(struct ata_device *dev) bool cdl_enabled; u64 val; - if (ata_id_major_version(dev->id) < 12) + if (ata_id_major_version(dev->id) < 11) goto not_supported; if (!ata_log_supported(dev, ATA_LOG_IDENTIFY_DEVICE) || From 283454c8a123072e5c386a5a2b5fc576aa455b6f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Apr 2024 10:10:15 -0700 Subject: [PATCH 196/270] af_unix: Call manage_oob() for every skb in unix_stream_read_generic(). When we call recv() for AF_UNIX socket, we first peek one skb and calls manage_oob() to check if the skb is sent with MSG_OOB. However, when we fetch the next (and the following) skb, manage_oob() is not called now, leading a wrong behaviour. Let's say a socket send()s "hello" with MSG_OOB and the peer tries to recv() 5 bytes with MSG_PEEK. Here, we should get only "hell" without 'o', but actually not: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'hello', MSG_OOB) 5 >>> c2.recv(5, MSG_PEEK) b'hello' The first skb fills 4 bytes, and the next skb is peeked but not properly checked by manage_oob(). Let's move up the again label to call manage_oob() for evry skb. With this patch: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'hello', MSG_OOB) 5 >>> c2.recv(5, MSG_PEEK) b'hell' Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240410171016.7621-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d032eb5fa6df..f297320438bf 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2741,6 +2741,7 @@ redo: last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; +again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); @@ -2752,7 +2753,6 @@ redo: } } #endif -again: if (skb == NULL) { if (copied >= target) goto unlock; From 22dd70eb2c3d754862964377a75abafd3167346b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Apr 2024 10:10:16 -0700 Subject: [PATCH 197/270] af_unix: Don't peek OOB data without MSG_OOB. Currently, we can read OOB data without MSG_OOB by using MSG_PEEK when OOB data is sitting on the front row, which is apparently wrong. >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'a', MSG_OOB) 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) b'a' If manage_oob() is called when no data has been copied, we only check if the socket enables SO_OOBINLINE or MSG_PEEK is not used. Otherwise, the skb is returned as is. However, here we should return NULL if MSG_PEEK is set and no data has been copied. Also, in such a case, we should not jump to the redo label because we will be caught in the loop and hog the CPU until normal data comes in. Then, we need to handle skb == NULL case with the if-clause below the manage_oob() block. With this patch: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'a', MSG_OOB) 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) Traceback (most recent call last): File "", line 1, in BlockingIOError: [Errno 11] Resource temporarily unavailable Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240410171016.7621-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f297320438bf..9a6ad5974dff 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2663,7 +2663,9 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); } - } else if (!(flags & MSG_PEEK)) { + } else if (flags & MSG_PEEK) { + skb = NULL; + } else { skb_unlink(skb, &sk->sk_receive_queue); WRITE_ONCE(u->oob_skb, NULL); if (!WARN_ON_ONCE(skb_unref(skb))) @@ -2745,11 +2747,9 @@ again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); - if (!skb) { + if (!skb && copied) { unix_state_unlock(sk); - if (copied) - break; - goto redo; + break; } } #endif From 68aba00483c7c4102429bcdfdece7289a8ab5c8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Thu, 11 Apr 2024 11:13:18 +0000 Subject: [PATCH 198/270] net: sparx5: flower: fix fragment flags handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I noticed that only 3 out of the 4 input bits were used, mt.key->flags & FLOW_DIS_IS_FRAGMENT was never checked. In order to avoid a complicated maze, I converted it to use a 16 byte mapping table. As shown in the table below the old heuristics doesn't always do the right thing, ie. when FLOW_DIS_IS_FRAGMENT=1/1 then it used to only match follow-up fragment packets. Here are all the combinations, and their resulting new/old VCAP key/mask filter: /- FLOW_DIS_IS_FRAGMENT (key/mask) | /- FLOW_DIS_FIRST_FRAG (key/mask) | | /-- new VCAP fragment (key/mask) v v v v- old VCAP fragment (key/mask) 0/0 0/0 -/- -/- impossible (due to entry cond. on mask) 0/0 0/1 -/- 0/3 !! invalid (can't match non-fragment + follow-up frag) 0/0 1/0 -/- -/- impossible (key > mask) 0/0 1/1 1/3 1/3 first fragment 0/1 0/0 0/3 3/3 !! not fragmented 0/1 0/1 0/3 3/3 !! not fragmented (+ not first fragment) 0/1 1/0 -/- -/- impossible (key > mask) 0/1 1/1 -/- 1/3 !! invalid (non-fragment and first frag) 1/0 0/0 -/- -/- impossible (key > mask) 1/0 0/1 -/- -/- impossible (key > mask) 1/0 1/0 -/- -/- impossible (key > mask) 1/0 1/1 -/- -/- impossible (key > mask) 1/1 0/0 1/1 3/3 !! some fragment 1/1 0/1 3/3 3/3 follow-up fragment 1/1 1/0 -/- -/- impossible (key > mask) 1/1 1/1 1/3 1/3 first fragment In the datasheet the VCAP fragment values are documented as: 0 = no fragment 1 = initial fragment 2 = suspicious fragment 3 = valid follow-up fragment Result: 3 combinations match the old behavior, 3 combinations have been corrected, 2 combinations are now invalid, and fail, 8 combinations are impossible. It should now be aligned with how FLOW_DIS_IS_FRAGMENT and FLOW_DIS_FIRST_FRAG is set in __skb_flow_dissect() in net/core/flow_dissector.c Since the VCAP fragment values are not a bitfield, we have to ignore the suspicious fragment value, eg. when matching on any kind of fragment with FLOW_DIS_IS_FRAGMENT=1/1. Only compile tested, and logic tested in userspace, as I unfortunately don't have access to this switch chip (yet). Fixes: d6c2964db3fe ("net: microchip: sparx5: Adding more tc flower keys for the IS2 VCAP") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Steen Hegelund Tested-by: Daniel Machon Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20240411111321.114095-1-ast@fiberby.net Signed-off-by: Jakub Kicinski --- .../microchip/sparx5/sparx5_tc_flower.c | 61 ++++++++++++------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c b/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c index 523e0c470894..55f255a3c9db 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c @@ -36,6 +36,27 @@ struct sparx5_tc_flower_template { u16 l3_proto; /* protocol specified in the template */ }; +/* SparX-5 VCAP fragment types: + * 0 = no fragment, 1 = initial fragment, + * 2 = suspicious fragment, 3 = valid follow-up fragment + */ +enum { /* key / mask */ + FRAG_NOT = 0x03, /* 0 / 3 */ + FRAG_SOME = 0x11, /* 1 / 1 */ + FRAG_FIRST = 0x13, /* 1 / 3 */ + FRAG_LATER = 0x33, /* 3 / 3 */ + FRAG_INVAL = 0xff, /* invalid */ +}; + +/* Flower fragment flag to VCAP fragment type mapping */ +static const u8 sparx5_vcap_frag_map[4][4] = { /* is_frag */ + { FRAG_INVAL, FRAG_INVAL, FRAG_INVAL, FRAG_FIRST }, /* 0/0 */ + { FRAG_NOT, FRAG_NOT, FRAG_INVAL, FRAG_INVAL }, /* 0/1 */ + { FRAG_INVAL, FRAG_INVAL, FRAG_INVAL, FRAG_INVAL }, /* 1/0 */ + { FRAG_SOME, FRAG_LATER, FRAG_INVAL, FRAG_FIRST } /* 1/1 */ + /* 0/0 0/1 1/0 1/1 <-- first_frag */ +}; + static int sparx5_tc_flower_es0_tpid(struct vcap_tc_flower_parse_usage *st) { @@ -145,29 +166,27 @@ sparx5_tc_flower_handler_control_usage(struct vcap_tc_flower_parse_usage *st) flow_rule_match_control(st->frule, &mt); if (mt.mask->flags) { - if (mt.mask->flags & FLOW_DIS_FIRST_FRAG) { - if (mt.key->flags & FLOW_DIS_FIRST_FRAG) { - value = 1; /* initial fragment */ - mask = 0x3; - } else { - if (mt.mask->flags & FLOW_DIS_IS_FRAGMENT) { - value = 3; /* follow up fragment */ - mask = 0x3; - } else { - value = 0; /* no fragment */ - mask = 0x3; - } - } - } else { - if (mt.mask->flags & FLOW_DIS_IS_FRAGMENT) { - value = 3; /* follow up fragment */ - mask = 0x3; - } else { - value = 0; /* no fragment */ - mask = 0x3; - } + u8 is_frag_key = !!(mt.key->flags & FLOW_DIS_IS_FRAGMENT); + u8 is_frag_mask = !!(mt.mask->flags & FLOW_DIS_IS_FRAGMENT); + u8 is_frag_idx = (is_frag_key << 1) | is_frag_mask; + + u8 first_frag_key = !!(mt.key->flags & FLOW_DIS_FIRST_FRAG); + u8 first_frag_mask = !!(mt.mask->flags & FLOW_DIS_FIRST_FRAG); + u8 first_frag_idx = (first_frag_key << 1) | first_frag_mask; + + /* Lookup verdict based on the 2 + 2 input bits */ + u8 vdt = sparx5_vcap_frag_map[is_frag_idx][first_frag_idx]; + + if (vdt == FRAG_INVAL) { + NL_SET_ERR_MSG_MOD(st->fco->common.extack, + "Match on invalid fragment flag combination"); + return -EINVAL; } + /* Extract VCAP fragment key and mask from verdict */ + value = (vdt >> 4) & 0x3; + mask = vdt & 0x3; + err = vcap_rule_add_key_u32(st->vrule, VCAP_KF_L3_FRAGMENT_TYPE, value, mask); From 37cc10da3a50e6d0cb9808a90b7da9b4868794dd Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 11 Apr 2024 14:54:39 +0300 Subject: [PATCH 199/270] net/mlx5: Lag, restore buckets number to default after hash LAG deactivation The cited patch introduces the concept of buckets in LAG in hash mode. However, the patch doesn't clear the number of buckets in the LAG deactivation. This results in using the wrong number of buckets in case user create a hash mode LAG and afterwards create a non-hash mode LAG. Hence, restore buckets number to default after hash mode LAG deactivation. Fixes: 352899f384d4 ("net/mlx5: Lag, use buckets in hash mode") Signed-off-by: Shay Drory Reviewed-by: Maor Gottlieb Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index d14459e5c04f..69d482f7c5a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -703,8 +703,10 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev) return err; } - if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) + if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) { mlx5_lag_port_sel_destroy(ldev); + ldev->buckets = 1; + } if (mlx5_lag_has_drop_rule(ldev)) mlx5_lag_drop_rule_cleanup(ldev); From aa4ac90d04f4371466000825adb44935ecb5c974 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 11 Apr 2024 14:54:40 +0300 Subject: [PATCH 200/270] net/mlx5: SD, Handle possible devcom ERR_PTR Check if devcom holds an error pointer and return immediately. This fixes Smatch static checker warning: drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c:221 sd_register() error: 'devcom' dereferencing possible ERR_PTR() Enhance mlx5_devcom_register_component() so it stops returning NULL, making it easier for its callers. Fixes: d3d057666090 ("net/mlx5: SD, Implement devcom communication and primary election") Reported-by: Dan Carpenter Link: https://lore.kernel.org/all/f09666c8-e604-41f6-958b-4cc55c73faf9@gmail.com/T/ Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Link: https://lore.kernel.org/r/20240411115444.374475-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b375ef268671..319930c04093 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -209,8 +209,8 @@ static int mlx5e_devcom_init_mpv(struct mlx5e_priv *priv, u64 *data) *data, mlx5e_devcom_event_mpv, priv); - if (IS_ERR_OR_NULL(priv->devcom)) - return -EOPNOTSUPP; + if (IS_ERR(priv->devcom)) + return PTR_ERR(priv->devcom); if (mlx5_core_is_mp_master(priv->mdev)) { mlx5_devcom_send_event(priv->devcom, MPV_DEVCOM_MASTER_UP, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 1f60954c12f7..844d3e3a65dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3060,7 +3060,7 @@ void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, u64 key) key, mlx5_esw_offloads_devcom_event, esw); - if (IS_ERR_OR_NULL(esw->devcom)) + if (IS_ERR(esw->devcom)) return; mlx5_devcom_send_event(esw->devcom, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c index e7d59cfa8708..7b0766c89f4c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c @@ -220,7 +220,7 @@ mlx5_devcom_register_component(struct mlx5_devcom_dev *devc, struct mlx5_devcom_comp *comp; if (IS_ERR_OR_NULL(devc)) - return NULL; + return ERR_PTR(-EINVAL); mutex_lock(&comp_list_lock); comp = devcom_component_get(devc, id, key, handler); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 5b28084e8a03..dd5d186dc614 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -213,8 +213,8 @@ static int sd_register(struct mlx5_core_dev *dev) sd = mlx5_get_sd(dev); devcom = mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_SD_GROUP, sd->group_id, NULL, dev); - if (!devcom) - return -ENOMEM; + if (IS_ERR(devcom)) + return PTR_ERR(devcom); sd->devcom = devcom; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 59806553889e..a39c4b25ba28 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -956,7 +956,7 @@ static void mlx5_register_hca_devcom_comp(struct mlx5_core_dev *dev) mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_HCA_PORTS, mlx5_query_nic_system_image_guid(dev), NULL, dev); - if (IS_ERR_OR_NULL(dev->priv.hca_devcom_comp)) + if (IS_ERR(dev->priv.hca_devcom_comp)) mlx5_core_err(dev, "Failed to register devcom HCA component\n"); } From bf729988303a27833a86acb561f42b9a3cc12728 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 11 Apr 2024 14:54:41 +0300 Subject: [PATCH 201/270] net/mlx5: Restore mistakenly dropped parts in register devlink flow Code parts from cited commit were mistakenly dropped while rebasing before submission. Add them here. Fixes: c6e77aa9dd82 ("net/mlx5: Register devlink first under devlink lock") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 5 ++++- drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a39c4b25ba28..331ce47f51a1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1699,12 +1699,15 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev) err = mlx5_devlink_params_register(priv_to_devlink(dev)); if (err) { mlx5_core_warn(dev, "mlx5_devlink_param_reg err = %d\n", err); - goto query_hca_caps_err; + goto params_reg_err; } devl_unlock(devlink); return 0; +params_reg_err: + devl_unregister(devlink); + devl_unlock(devlink); query_hca_caps_err: devl_unregister(devlink); devl_unlock(devlink); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index e3bf8c7e4baa..7ebe71280827 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -75,7 +75,6 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia goto peer_devlink_set_err; } - devlink_register(devlink); return 0; peer_devlink_set_err: From 6c685bdb9e1af966ec0278dbd4068ec39ae88c2d Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Thu, 11 Apr 2024 14:54:42 +0300 Subject: [PATCH 202/270] net/mlx5e: Use channel mdev reference instead of global mdev instance for coalescing Channels can potentially have independent mdev instances. Do not refer to the global mdev instance in the mlx5e_priv instance for channel FW operations related to coalescing. CQ numbers that would be valid on the channel's mdev instance may not be correctly referenced if using the mlx5e_priv instance. Fixes: 67936e138586 ("net/mlx5e: Let channels be SD-aware") Signed-off-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 8f101181648c..67a29826bb57 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -589,12 +589,12 @@ static int mlx5e_get_coalesce(struct net_device *netdev, static void mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { - struct mlx5_core_dev *mdev = priv->mdev; int tc; int i; for (i = 0; i < priv->channels.num; ++i) { struct mlx5e_channel *c = priv->channels.c[i]; + struct mlx5_core_dev *mdev = c->mdev; for (tc = 0; tc < c->num_tc; tc++) { mlx5_core_modify_cq_moderation(mdev, @@ -608,11 +608,11 @@ mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coal static void mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { - struct mlx5_core_dev *mdev = priv->mdev; int i; for (i = 0; i < priv->channels.num; ++i) { struct mlx5e_channel *c = priv->channels.c[i]; + struct mlx5_core_dev *mdev = c->mdev; mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq, coal->rx_coalesce_usecs, From fdce06bda7e56b2a34c53ea58bad7af2fae96da1 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Apr 2024 14:54:43 +0300 Subject: [PATCH 203/270] net/mlx5e: Acquire RTNL lock before RQs/SQs activation/deactivation netif_queue_set_napi asserts whether RTNL lock is held if the netdev is initialized. Acquire the RTNL lock before activating or deactivating RQs/SQs if the lock has not been held before in the flow. Fixes: f25e7b82635f ("net/mlx5e: link NAPI instances to queues and IRQs") Cc: Joe Damato Signed-off-by: Carolina Jubran Reviewed-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 0ab9db319530..22918b2ef7f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -108,7 +108,10 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) mlx5e_reset_txqsq_cc_pc(sq); sq->stats->recover++; clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); + rtnl_lock(); mlx5e_activate_txqsq(sq); + rtnl_unlock(); + if (sq->channel) mlx5e_trigger_napi_icosq(sq->channel); else @@ -179,12 +182,16 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) carrier_ok = netif_carrier_ok(netdev); netif_carrier_off(netdev); + rtnl_lock(); mlx5e_deactivate_priv_channels(priv); + rtnl_unlock(); mlx5e_ptp_close(chs->ptp); err = mlx5e_ptp_open(priv, &chs->params, chs->c[0]->lag_port, &chs->ptp); + rtnl_lock(); mlx5e_activate_priv_channels(priv); + rtnl_unlock(); /* return carrier back if needed */ if (carrier_ok) From fef965764cf562f28afb997b626fc7c3cec99693 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Apr 2024 14:54:44 +0300 Subject: [PATCH 204/270] net/mlx5e: Prevent deadlock while disabling aRFS When disabling aRFS under the `priv->state_lock`, any scheduled aRFS works are canceled using the `cancel_work_sync` function, which waits for the work to end if it has already started. However, while waiting for the work handler, the handler will try to acquire the `state_lock` which is already acquired. The worker acquires the lock to delete the rules if the state is down, which is not the worker's responsibility since disabling aRFS deletes the rules. Add an aRFS state variable, which indicates whether the aRFS is enabled and prevent adding rules when the aRFS is disabled. Kernel log: ====================================================== WARNING: possible circular locking dependency detected 6.7.0-rc4_net_next_mlx5_5483eb2 #1 Tainted: G I ------------------------------------------------------ ethtool/386089 is trying to acquire lock: ffff88810f21ce68 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}, at: __flush_work+0x74/0x4e0 but task is already holding lock: ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&priv->state_lock){+.+.}-{3:3}: __mutex_lock+0x80/0xc90 arfs_handle_work+0x4b/0x3b0 [mlx5_core] process_one_work+0x1dc/0x4a0 worker_thread+0x1bf/0x3c0 kthread+0xd7/0x100 ret_from_fork+0x2d/0x50 ret_from_fork_asm+0x11/0x20 -> #0 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}: __lock_acquire+0x17b4/0x2c80 lock_acquire+0xd0/0x2b0 __flush_work+0x7a/0x4e0 __cancel_work_timer+0x131/0x1c0 arfs_del_rules+0x143/0x1e0 [mlx5_core] mlx5e_arfs_disable+0x1b/0x30 [mlx5_core] mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core] ethnl_set_channels+0x28f/0x3b0 ethnl_default_set_doit+0xec/0x240 genl_family_rcv_msg_doit+0xd0/0x120 genl_rcv_msg+0x188/0x2c0 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1a1/0x270 netlink_sendmsg+0x214/0x460 __sock_sendmsg+0x38/0x60 __sys_sendto+0x113/0x170 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&priv->state_lock); lock((work_completion)(&rule->arfs_work)); lock(&priv->state_lock); lock((work_completion)(&rule->arfs_work)); *** DEADLOCK *** 3 locks held by ethtool/386089: #0: ffffffff82ea7210 (cb_lock){++++}-{3:3}, at: genl_rcv+0x15/0x40 #1: ffffffff82e94c88 (rtnl_mutex){+.+.}-{3:3}, at: ethnl_default_set_doit+0xd3/0x240 #2: ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core] stack backtrace: CPU: 15 PID: 386089 Comm: ethtool Tainted: G I 6.7.0-rc4_net_next_mlx5_5483eb2 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x60/0xa0 check_noncircular+0x144/0x160 __lock_acquire+0x17b4/0x2c80 lock_acquire+0xd0/0x2b0 ? __flush_work+0x74/0x4e0 ? save_trace+0x3e/0x360 ? __flush_work+0x74/0x4e0 __flush_work+0x7a/0x4e0 ? __flush_work+0x74/0x4e0 ? __lock_acquire+0xa78/0x2c80 ? lock_acquire+0xd0/0x2b0 ? mark_held_locks+0x49/0x70 __cancel_work_timer+0x131/0x1c0 ? mark_held_locks+0x49/0x70 arfs_del_rules+0x143/0x1e0 [mlx5_core] mlx5e_arfs_disable+0x1b/0x30 [mlx5_core] mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core] ethnl_set_channels+0x28f/0x3b0 ethnl_default_set_doit+0xec/0x240 genl_family_rcv_msg_doit+0xd0/0x120 genl_rcv_msg+0x188/0x2c0 ? ethnl_ops_begin+0xb0/0xb0 ? genl_family_rcv_msg_dumpit+0xf0/0xf0 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1a1/0x270 netlink_sendmsg+0x214/0x460 __sock_sendmsg+0x38/0x60 __sys_sendto+0x113/0x170 ? do_user_addr_fault+0x53f/0x8f0 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e Fixes: 45bf454ae884 ("net/mlx5e: Enabling aRFS mechanism") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index c7f542d0b8f0..93cf23278d93 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -46,6 +46,10 @@ struct arfs_table { struct hlist_head rules_hash[ARFS_HASH_SIZE]; }; +enum { + MLX5E_ARFS_STATE_ENABLED, +}; + enum arfs_type { ARFS_IPV4_TCP, ARFS_IPV6_TCP, @@ -60,6 +64,7 @@ struct mlx5e_arfs_tables { spinlock_t arfs_lock; int last_filter_id; struct workqueue_struct *wq; + unsigned long state; }; struct arfs_tuple { @@ -170,6 +175,8 @@ int mlx5e_arfs_enable(struct mlx5e_flow_steering *fs) return err; } } + set_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state); + return 0; } @@ -455,6 +462,8 @@ static void arfs_del_rules(struct mlx5e_flow_steering *fs) int i; int j; + clear_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state); + spin_lock_bh(&arfs->arfs_lock); mlx5e_for_each_arfs_rule(rule, htmp, arfs->arfs_tables, i, j) { hlist_del_init(&rule->hlist); @@ -627,17 +636,8 @@ static void arfs_handle_work(struct work_struct *work) struct mlx5_flow_handle *rule; arfs = mlx5e_fs_get_arfs(priv->fs); - mutex_lock(&priv->state_lock); - if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { - spin_lock_bh(&arfs->arfs_lock); - hlist_del(&arfs_rule->hlist); - spin_unlock_bh(&arfs->arfs_lock); - - mutex_unlock(&priv->state_lock); - kfree(arfs_rule); - goto out; - } - mutex_unlock(&priv->state_lock); + if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state)) + return; if (!arfs_rule->rule) { rule = arfs_add_rule(priv, arfs_rule); @@ -753,6 +753,11 @@ int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, return -EPROTONOSUPPORT; spin_lock_bh(&arfs->arfs_lock); + if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state)) { + spin_unlock_bh(&arfs->arfs_lock); + return -EPERM; + } + arfs_rule = arfs_find_rule(arfs_t, &fk); if (arfs_rule) { if (arfs_rule->rxq == rxq_index || work_busy(&arfs_rule->arfs_work)) { From 58caa786f1c02fd84919fb6db9eaecb22e8f7983 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 17:47:42 -0400 Subject: [PATCH 205/270] bcachefs: Fix UAFs of btree_insert_entry array The btree paths array is now dynamically resizable - and as well the btree_insert_entries array, as it needs to be the same size. The merge path (and interior update path) allocates new btree paths, thus can trigger a resize; thus we need to not retain direct pointers after invoking merge; similarly when running btree node triggers. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index aa9da4970740..9609ad370d71 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -499,9 +499,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - struct btree_insert_entry *btree_id_start) + unsigned btree_id_start) { - struct btree_insert_entry *i; bool trans_trigger_run; int ret, overwrite; @@ -514,13 +513,13 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, do { trans_trigger_run = false; - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + for (unsigned i = btree_id_start; + i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; i++) { - if (i->btree_id != btree_id) + if (trans->updates[i].btree_id != btree_id) continue; - ret = run_one_trans_trigger(trans, i, overwrite); + ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); if (ret < 0) return ret; if (ret) @@ -534,8 +533,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - struct btree_insert_entry *btree_id_start = trans->updates; - unsigned btree_id = 0; + unsigned btree_id = 0, btree_id_start = 0; int ret = 0; /* @@ -549,8 +547,8 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) if (btree_id == BTREE_ID_alloc) continue; - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) + while (btree_id_start < trans->nr_updates && + trans->updates[btree_id_start].btree_id < btree_id) btree_id_start++; ret = run_btree_triggers(trans, btree_id, btree_id_start); @@ -558,11 +556,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) return ret; } - trans_for_each_update(trans, i) { + for (unsigned idx = 0; idx < trans->nr_updates; idx++) { + struct btree_insert_entry *i = trans->updates + idx; + if (i->btree_id > BTREE_ID_alloc) break; if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + ret = run_btree_triggers(trans, BTREE_ID_alloc, idx); if (ret) return ret; break; @@ -826,7 +826,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags struct bch_fs *c = trans->c; int ret = 0, u64s_delta = 0; - trans_for_each_update(trans, i) { + for (unsigned idx = 0; idx < trans->nr_updates; idx++) { + struct btree_insert_entry *i = trans->updates + idx; if (i->cached) continue; From 031ad9e7dbd18c63e671fc4d98be3082189b8a63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 21:30:43 -0400 Subject: [PATCH 206/270] bcachefs: Check for packed bkeys that are too big add missing validation; fixes assertion pop in bkey unpack Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 6 ++++++ fs/bcachefs/btree_io.c | 15 ++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index cf23ff47bed8..3a45d128f608 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -314,6 +314,12 @@ static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, return bkey_packed(k) ? format->key_u64s : BKEY_U64s; } +static inline bool bkeyp_u64s_valid(const struct bkey_format *f, + const struct bkey_packed *k) +{ + return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s); +} + static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, const struct bkey_packed *k) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d7de82ac3893..b1aaf0550644 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -831,7 +831,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); } -static bool __bkey_valid(struct bch_fs *c, struct btree *b, +static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, struct bset *i, struct bkey_packed *k) { if (bkey_p_next(k) > vstruct_last(i)) @@ -840,7 +840,7 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b, if (k->format > KEY_FORMAT_CURRENT) return false; - if (k->u64s < bkeyp_key_u64s(&b->format, k)) + if (!bkeyp_u64s_valid(&b->format, k)) return false; struct printbuf buf = PRINTBUF; @@ -884,11 +884,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, "invalid bkey format %u", k->format)) goto drop_this_key; - if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k), + if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, btree_node_bkey_bad_u64s, - "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k))) + "bad k->u64s %u (min %u max %lu)", k->u64s, + bkeyp_key_u64s(&b->format, k), + U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k))) goto drop_this_key; if (!write) @@ -947,13 +949,12 @@ drop_this_key: * do */ - if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { + if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { for (next_good_key = 1; next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; next_good_key++) - if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) + if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) goto got_good_key; - } /* From 87cb0239c87f608fd48fb50f9f53f129dcfd73f4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 23:38:07 -0400 Subject: [PATCH 207/270] bcachefs: btree node scan: handle encrypted nodes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_node_scan.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 556f76f5c84e..20f2b37c4474 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -133,9 +133,19 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (le64_to_cpu(bn->magic) != bset_magic(c)) return; + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { + struct nonce nonce = btree_nonce(&bn->keys, 0); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + + bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); + } + if (btree_id_is_alloc(BTREE_NODE_ID(bn))) return; + if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) + return; + rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), From dc32c118ec6b1032693c489a0aa9e011f0acdb1a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 21:20:27 -0400 Subject: [PATCH 208/270] bcachefs: fix unsafety in bch2_extent_ptr_to_text() Need to check if we have a valid bucket before checking if ptr is stale Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 0e3ca99fbd2d..36d12d2adb81 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -998,7 +998,9 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_str(out, " cached"); if (ptr->unwritten) prt_str(out, " unwritten"); - if (ca && ptr_stale(ca, ptr)) + if (b >= ca->mi.first_bucket && + b < ca->mi.nbuckets && + ptr_stale(ca, ptr)) prt_printf(out, " stale"); } } From 2aeed876d7c2c2fb4c6b92f59bdf7e73cfe5e098 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 23:37:24 -0400 Subject: [PATCH 209/270] bcachefs: fix unsafety in bch2_stripe_to_text() .to_text() functions need to work on key values that didn't pass .valid Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 42 +++++++++++++++++++++++------------------- fs/bcachefs/ec.h | 2 ++ 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 082075244e16..c0b2b2f180e0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -131,29 +131,33 @@ fsck_err: void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned i, nr_data = s->nr_blocks - s->nr_redundant; + const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; + struct bch_stripe s = {}; + + memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); + + unsigned nr_data = s.nr_blocks - s.nr_redundant; prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", - s->algorithm, - le16_to_cpu(s->sectors), - nr_data, - s->nr_redundant, - s->csum_type, - 1U << s->csum_granularity_bits); + s.algorithm, + le16_to_cpu(s.sectors), + nr_data, + s.nr_redundant, + s.csum_type, + 1U << s.csum_granularity_bits); - for (i = 0; i < s->nr_blocks; i++) { - const struct bch_extent_ptr *ptr = s->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + for (unsigned i = 0; i < s.nr_blocks; i++) { + const struct bch_extent_ptr *ptr = sp->ptrs + i; - prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); - if (i < nr_data) - prt_printf(out, "#%u", stripe_blockcount_get(s, i)); - prt_printf(out, " gen %u", ptr->gen); - if (ptr_stale(ca, ptr)) - prt_printf(out, " stale"); + if ((void *) ptr >= bkey_val_end(k)) + break; + + bch2_extent_ptr_to_text(out, c, ptr); + + if (s.csum_type < BCH_CSUM_NR && + i < nr_data && + stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k)) + prt_printf(out, "#%u", stripe_blockcount_get(sp, i)); } } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index f4369b02e805..f042616888b0 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -32,6 +32,8 @@ static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) static inline unsigned stripe_csum_offset(const struct bch_stripe *s, unsigned dev, unsigned csum_idx) { + EBUG_ON(s->csum_type >= BCH_CSUM_NR); + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; return sizeof(struct bch_stripe) + From 7b4c4ccf848b359762c1525bab5a12f008f14a65 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 23:58:36 -0400 Subject: [PATCH 210/270] bcachefs: fix race in bch2_btree_node_evict() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 84474324dba9..c7f156320a35 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1148,6 +1148,8 @@ wait_on_io: btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + if (unlikely(b->hash_val != btree_ptr_hash_val(k))) + goto out; if (btree_node_dirty(b)) { __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); @@ -1162,7 +1164,7 @@ wait_on_io: btree_node_data_free(c, b); bch2_btree_node_hash_remove(bc, b); mutex_unlock(&bc->lock); - +out: six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } From ba8ed36e72033dd6b89d44145f323e6c4508bfc9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 00:09:08 -0400 Subject: [PATCH 211/270] bcachefs: don't queue btree nodes for rewrites during scan many nodes found during scan will be old nodes, overwritten by newer nodes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b1aaf0550644..9678b2375bed 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1340,7 +1340,9 @@ start: rb->start_time); bio_put(&rb->bio); - if (saw_error && !btree_node_read_error(b)) { + if (saw_error && + !btree_node_read_error(b) && + c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->key.k.p); bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", From 9abb6dd7ce5a261f7aebf6f396b50a63db71133f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 15:17:00 -0400 Subject: [PATCH 212/270] bcachefs: Standardize helpers for printing enum strs with bounds checks Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 4 ++-- fs/bcachefs/buckets.h | 8 -------- fs/bcachefs/checksum.c | 23 ++++++++++++++--------- fs/bcachefs/checksum.h | 5 +++-- fs/bcachefs/compress.h | 8 -------- fs/bcachefs/ec.c | 14 ++++++-------- fs/bcachefs/extents.c | 7 ++++--- fs/bcachefs/journal_io.c | 17 +++++++++-------- fs/bcachefs/opts.c | 29 +++++++++++++++++++++++++---- fs/bcachefs/opts.h | 10 ++++++---- 10 files changed, 69 insertions(+), 56 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 364ae42022af..9480f1b44f07 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1314,7 +1314,7 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(write_buffer_keys, 11) \ x(datetime, 12) -enum { +enum bch_jset_entry_type { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, BCH_JSET_ENTRY_TYPES() #undef x @@ -1360,7 +1360,7 @@ struct jset_entry_blacklist_v2 { x(inodes, 1) \ x(key_version, 2) -enum { +enum bch_fs_usage_type { #define x(f, nr) BCH_FS_USAGE_##f = nr, BCH_FS_USAGE_TYPES() #undef x diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 00aaf4bb5139..f9af5adabe83 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -395,14 +395,6 @@ static inline const char *bch2_data_type_str(enum bch_data_type type) : "(invalid data type)"; } -static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type) -{ - if (type < BCH_DATA_NR) - prt_str(out, __bch2_data_types[type]); - else - prt_printf(out, "(invalid data type %u)", type); -} - /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 4701457f6381..7ed779b411f6 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -429,15 +429,20 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, extent_nonce(version, crc_old), bio); if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { - bch_err(c, "checksum error in %s() (memory corruption or bug?)\n" - "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", - __func__, - crc_old.csum.hi, - crc_old.csum.lo, - merged.hi, - merged.lo, - bch2_csum_types[crc_old.csum_type], - bch2_csum_types[new_csum_type]); + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" + "expected %0llx:%0llx got %0llx:%0llx (old type ", + __func__, + crc_old.csum.hi, + crc_old.csum.lo, + merged.hi, + merged.lo); + bch2_prt_csum_type(&buf, crc_old.csum_type); + prt_str(&buf, " new type "); + bch2_prt_csum_type(&buf, new_csum_type); + prt_str(&buf, ")"); + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); return -EIO; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 1b8c2c1016dc..e40499fde9a4 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -61,11 +61,12 @@ static inline void bch2_csum_err_msg(struct printbuf *out, struct bch_csum expected, struct bch_csum got) { - prt_printf(out, "checksum error: got "); + prt_str(out, "checksum error, type "); + bch2_prt_csum_type(out, type); + prt_str(out, ": got "); bch2_csum_to_text(out, type, got); prt_str(out, " should be "); bch2_csum_to_text(out, type, expected); - prt_printf(out, " type %s", bch2_csum_types[type]); } int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 58c2eb45570f..607fd5e232c9 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,14 +47,6 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } -static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type) -{ - if (type < BCH_COMPRESSION_TYPE_NR) - prt_str(out, __bch2_compression_types[type]); - else - prt_printf(out, "(invalid compression type %u)", type); -} - int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c0b2b2f180e0..556a217108d3 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -138,13 +138,13 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, unsigned nr_data = s.nr_blocks - s.nr_redundant; - prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + prt_printf(out, "algo %u sectors %u blocks %u:%u csum ", s.algorithm, le16_to_cpu(s.sectors), nr_data, - s.nr_redundant, - s.csum_type, - 1U << s.csum_granularity_bits); + s.nr_redundant); + bch2_prt_csum_type(out, s.csum_type); + prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); for (unsigned i = 0; i < s.nr_blocks; i++) { const struct bch_extent_ptr *ptr = sp->ptrs + i; @@ -611,10 +611,8 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct printbuf err = PRINTBUF; struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); - prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n", - want.hi, want.lo, - got.hi, got.lo, - bch2_csum_types[v->csum_type]); + prt_str(&err, "stripe "); + bch2_csum_err_msg(&err, v->csum_type, want, got); prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); bch_err_ratelimited(ca, "%s", err.buf); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 36d12d2adb81..1a331e539204 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1030,11 +1030,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", crc.compressed_size, crc.uncompressed_size, - crc.offset, crc.nonce, - bch2_csum_types[crc.csum_type]); + crc.offset, crc.nonce); + bch2_prt_csum_type(out, crc.csum_type); + prt_str(out, " compress "); bch2_prt_compression_type(out, crc.compression_type); break; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 725fcf46f631..9aa28b52ab92 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -247,7 +247,7 @@ static void journal_entry_err_msg(struct printbuf *out, if (entry) { prt_str(out, " type="); - prt_str(out, bch2_jset_entry_types[entry->type]); + bch2_prt_jset_entry_type(out, entry->type); } if (!jset) { @@ -403,7 +403,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs jset_entry_for_each_key(entry, k) { if (!first) { prt_newline(out); - prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + bch2_prt_jset_entry_type(out, entry->type); + prt_str(out, ": "); } prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); @@ -563,9 +564,9 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - prt_printf(out, "type=%s v=%llu", - bch2_fs_usage_types[u->entry.btree_id], - le64_to_cpu(u->v)); + prt_str(out, "type="); + bch2_prt_fs_usage_type(out, u->entry.btree_id); + prt_printf(out, " v=%llu", le64_to_cpu(u->v)); } static int journal_entry_data_usage_validate(struct bch_fs *c, @@ -827,11 +828,11 @@ int bch2_journal_entry_validate(struct bch_fs *c, void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { + bch2_prt_jset_entry_type(out, entry->type); + if (entry->type < BCH_JSET_ENTRY_NR) { - prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + prt_str(out, ": "); bch2_jset_entry_ops[entry->type].to_text(out, c, entry); - } else { - prt_printf(out, "(unknown type %u)", entry->type); } } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index e1800c4119b5..bb068fd72465 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -43,7 +43,7 @@ const char * const __bch2_btree_ids[] = { NULL }; -const char * const bch2_csum_types[] = { +static const char * const __bch2_csum_types[] = { BCH_CSUM_TYPES() NULL }; @@ -53,7 +53,7 @@ const char * const bch2_csum_opts[] = { NULL }; -const char * const __bch2_compression_types[] = { +static const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; @@ -83,18 +83,39 @@ const char * const bch2_member_states[] = { NULL }; -const char * const bch2_jset_entry_types[] = { +static const char * const __bch2_jset_entry_types[] = { BCH_JSET_ENTRY_TYPES() NULL }; -const char * const bch2_fs_usage_types[] = { +static const char * const __bch2_fs_usage_types[] = { BCH_FS_USAGE_TYPES() NULL }; #undef x +static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], + unsigned nr, const char *type, unsigned idx) +{ + if (idx < nr) + prt_str(out, opts[idx]); + else + prt_printf(out, "(unknown %s %u)", type, idx); +} + +#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \ +void bch2_prt_##name(struct printbuf *out, type t) \ +{ \ + prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\ +} + +PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); +PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); +PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); +PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); +PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); + static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 1ac4135cca1c..84e452835a17 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -16,18 +16,20 @@ extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; -extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; -extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; -extern const char * const bch2_jset_entry_types[]; -extern const char * const bch2_fs_usage_types[]; extern const char * const bch2_d_types[]; +void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); +void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); +void bch2_prt_data_type(struct printbuf *, enum bch_data_type); +void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); +void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); + static inline const char *bch2_d_type_str(unsigned d_type) { return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; From 4518e80adfdbfdec1cc79c98bc73677ff44d96bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 14:05:36 -0400 Subject: [PATCH 213/270] bcachefs: Go rw if running any explicit recovery passes This fixes a bug where we fail to start when upgrading/downgrading because we forgot we needed to go rw. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index cb501460d615..0cec0f7d9703 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -44,7 +44,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || c->opts.fsck || !c->sb.clean) + if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit) return bch2_fs_read_write_early(c); return 0; } From 82cf18f23e1ae17053983e325e550194f947e1f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 21:07:05 -0400 Subject: [PATCH 214/270] bcachefs: Fix deadlock in journal replay btree_key_can_insert_cached() should be checking the watermark - BCH_TRANS_COMMIT_journal_replay really means nonblocking mode when watermark < reclaim, it was being used incorrectly. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 9609ad370d71..bbec91e8e650 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -397,12 +397,13 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags struct bkey_cached *ck = (void *) path->l[0].b; unsigned new_u64s; struct bkey_i *new_k; + unsigned watermark = flags & BCH_WATERMARK_MASK; EBUG_ON(path->level); - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(c) && - !(flags & BCH_TRANS_COMMIT_journal_reclaim)) + if (watermark < BCH_WATERMARK_reclaim && + !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(c)) return -BCH_ERR_btree_insert_need_journal_reclaim; /* From 9e203c43dc1cbaefb3888ee0ba885b2d20d47526 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 00:26:01 -0400 Subject: [PATCH 215/270] bcachefs: Fix missing write refs in fs fio paths bch2_journal_flush_seq requires us to have a write ref Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/fs-io-direct.c | 19 +++++++++++++------ fs/bcachefs/fs-io.c | 16 ++++++++-------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a31a5f706929..91c3c1fef233 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -709,6 +709,8 @@ struct btree_trans_buf { x(stripe_delete) \ x(reflink) \ x(fallocate) \ + x(fsync) \ + x(dio_write) \ x(discard) \ x(discard_fast) \ x(invalidate) \ diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index f49e6c0f0f68..b889370a5088 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -387,6 +387,8 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); + bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write); + /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); @@ -590,22 +592,25 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) prefetch(&inode->ei_inode); prefetch((void *) &inode->ei_inode + 64); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write)) + return -EROFS; + inode_lock(&inode->v); ret = generic_write_checks(req, iter); if (unlikely(ret <= 0)) - goto err; + goto err_put_write_ref; ret = file_remove_privs(file); if (unlikely(ret)) - goto err; + goto err_put_write_ref; ret = file_update_time(file); if (unlikely(ret)) - goto err; + goto err_put_write_ref; if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) - goto err; + goto err_put_write_ref; inode_dio_begin(&inode->v); bch2_pagecache_block_get(inode); @@ -645,7 +650,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) } ret = bch2_dio_write_loop(dio); -err: +out: if (locked) inode_unlock(&inode->v); return ret; @@ -653,7 +658,9 @@ err_put_bio: bch2_pagecache_block_put(inode); bio_put(bio); inode_dio_end(&inode->v); - goto err; +err_put_write_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + goto out; } void bch2_fs_fs_io_direct_exit(struct bch_fs *c) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8c70123b6a0c..20b40477425f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -174,18 +174,18 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, static int bch2_flush_inode(struct bch_fs *c, struct bch_inode_info *inode) { - struct bch_inode_unpacked u; - int ret; - if (c->opts.journal_flush_disabled) return 0; - ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); - if (ret) - return ret; + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) + return -EROFS; - return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: - bch2_inode_flush_nocow_writes(c, inode); + struct bch_inode_unpacked u; + int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: + bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: + bch2_inode_flush_nocow_writes(c, inode); + bch2_write_ref_put(c, BCH_WRITE_REF_fsync); + return ret; } int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) From 9054ef2ea944528b7935d0ce3f540d4dc0bc37ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 16:13:13 -0400 Subject: [PATCH 216/270] bcachefs: Run merges at BCH_WATERMARK_btree This fixes a deadlock where the interior update path during journal replay ends up doing a ton of merges on the backpointers btree, and deadlocking. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 29a5bc7d8789..1a7537ee4329 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1926,6 +1926,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); + flags &= ~BCH_WATERMARK_MASK; + b = trans->paths[path].l[level].b; if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || @@ -2071,6 +2073,10 @@ err: bch2_path_put(trans, new_path, true); bch2_path_put(trans, sib_path, true); bch2_trans_verify_locks(trans); + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) + ret = 0; + if (!ret) + ret = bch2_trans_relock(trans); return ret; err_free_update: bch2_btree_node_free_never_used(as, trans, n); From 3f10048973c8366e27147d72bf3fd8d0e1d929f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 18:39:03 -0400 Subject: [PATCH 217/270] bcachefs: Disable merges from interior update path There's been a bug in the btree write buffer where it wasn't triggering btree node merges - and leaving behind a bunch of nearly empty btree nodes. Then during journal replay, when updates to the backpointers btree aren't using the btree write buffer (because we require synchronization with journal replay), we end up doing those merges all at once. Then if it's the interior update path running them, we deadlock because those run with the highest watermark. There's no real need for the interior update path to be doing btree node merges; other code paths can handle that at lower watermarks. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1a7537ee4329..82ac00aa0f7b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1926,6 +1926,16 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); + /* + * Work around a deadlock caused by the btree write buffer not doing + * merges and leaving tons of merges for us to do - we really don't need + * to be doing merges at all from the interior update path, and if the + * interior update path is generating too many new interior updates we + * deadlock: + */ + if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates) + return 0; + flags &= ~BCH_WATERMARK_MASK; b = trans->paths[path].l[level].b; From 86dbf8c566417afd62087ecd3bedcf84f91ee4e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Dec 2023 22:42:34 -0500 Subject: [PATCH 218/270] bcachefs: Fix btree node merging on write buffer btrees The btree write buffer flush fastpath that avoids the main transaction commit path had the unfortunate side effect of not doing btree node merging. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index baf63e2fddb6..36a6f42aba5e 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -316,6 +316,16 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { bch2_btree_node_unlock_write(trans, path, path->l[0].b); write_locked = false; + + ret = lockrestart_do(trans, + bch2_btree_iter_traverse(&iter) ?: + bch2_foreground_maybe_merge(trans, iter.path, 0, + BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc)); + if (ret) + goto err; } } @@ -382,10 +392,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) ret = commit_do(trans, NULL, NULL, BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_journal_res| - BCH_TRANS_COMMIT_journal_reclaim, + BCH_TRANS_COMMIT_no_journal_res , btree_write_buffered_insert(trans, i)); if (ret) goto err; From 16b52bbee4823b01ab7fe3919373c981a38f3797 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 5 Apr 2024 17:56:35 +0300 Subject: [PATCH 219/270] kernfs: annotate different lockdep class for of->mutex of writable files The writable file /sys/power/resume may call vfs lookup helpers for arbitrary paths and readonly files can be read by overlayfs from vfs helpers when sysfs is a lower layer of overalyfs. To avoid a lockdep warning of circular dependency between overlayfs inode lock and kernfs of->mutex, use a different lockdep class for writable and readonly kernfs files. Reported-by: syzbot+9a5b0ced8b1bfb238b56@syzkaller.appspotmail.com Fixes: 0fedefd4c4e3 ("kernfs: sysfs: support custom llseek method for sysfs entries") Suggested-by: Al Viro Signed-off-by: Amir Goldstein Signed-off-by: Al Viro --- fs/kernfs/file.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e9df2f87072c..8502ef68459b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -636,11 +636,18 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * - * Both paths of the branch look the same. They're supposed to + * For similar reasons, writable and readonly files are given different + * lockdep key, because the writable file /sys/power/resume may call vfs + * lookup helpers for arbitrary paths and readonly files can be read by + * overlayfs from vfs helpers when sysfs is a lower layer of overalyfs. + * + * All three cases look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); + else if (file->f_mode & FMODE_WRITE) + mutex_init(&of->mutex); else mutex_init(&of->mutex); From 1382e3b6a3500c245e5278c66d210c02926f804f Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Thu, 11 Apr 2024 08:11:24 +0300 Subject: [PATCH 220/270] net: change maximum number of UDP segments to 128 The commit fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") adds check of potential number of UDP segments vs UDP_MAX_SEGMENTS in linux/virtio_net.h. After this change certification test of USO guest-to-guest transmit on Windows driver for virtio-net device fails, for example with packet size of ~64K and mss of 536 bytes. In general the USO should not be more restrictive than TSO. Indeed, in case of unreasonably small mss a lot of segments can cause queue overflow and packet loss on the destination. Limit of 128 segments is good for any practical purpose, with minimal meaningful mss of 536 the maximal UDP packet will be divided to ~120 segments. The number of segments for UDP packets is validated vs UDP_MAX_SEGMENTS also in udp.c (v4,v6), this does not affect quest-to-guest path but does affect packets sent to host, for example. It is important to mention that UDP_MAX_SEGMENTS is kernel-only define and not available to user mode socket applications. In order to request MSS smaller than MTU the applications just uses setsockopt with SOL_UDP and UDP_SEGMENT and there is no limitations on socket API level. Fixes: fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") Signed-off-by: Yuri Benditovich Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 2 +- tools/testing/selftests/net/udpgso.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 17539d089666..e398e1dbd2d3 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,7 +108,7 @@ struct udp_sock { #define udp_assign_bit(nr, sk, val) \ assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val) -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk) diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 1d975bf52af3..85b3baa3f7f3 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -34,7 +34,7 @@ #endif #ifndef UDP_MAX_SEGMENTS -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #endif #define CONST_MTU_TEST 1500 From 0bbac3facb5d6cc0171c45c9873a2dc96bea9680 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Apr 2024 13:38:39 -0700 Subject: [PATCH 221/270] Linux 6.9-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e1bf12891cb0..59d8a7f95d0a 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* From bceb86be9e970bc6d77bfd1f8e5272d9a2007c16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 23:59:06 -0400 Subject: [PATCH 222/270] bcachefs: add missing bounds check in __bch2_bkey_val_invalid() Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5e52684764eb..e4481e4d5f38 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -175,7 +175,10 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", - bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]); + bch2_btree_node_type_str(type), + k.k->type < KEY_TYPE_MAX + ? bch2_bkey_types[k.k->type] + : "(unknown)"); if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { bkey_fsck_err_on(k.k->size == 0, c, err, From d789e9a7d5e2799f4d5425b0b620210d2fcad529 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 23:59:28 -0400 Subject: [PATCH 223/270] bcachefs: Interior known are required to have known key types For forwards compatibilyt, we allow bkeys of unknown type in leaf nodes; we can simply ignore metadata we don't understand. Pointers to btree nodes must always be of known types, howwever. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index e4481e4d5f38..db336a43fc08 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -171,7 +171,8 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (type >= BKEY_TYPE_NR) return 0; - bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && + bkey_fsck_err_on((type == BKEY_TYPE_btree || + (flags & BKEY_INVALID_COMMIT)) && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", From 8cf2036e7b557282667a437d409e6307d55366ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 15:34:14 -0400 Subject: [PATCH 224/270] bcachefs: add safety checks in bch2_btree_node_fill() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index c7f156320a35..c347d08d80bc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -711,7 +711,30 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct btree *b; u32 seq; - BUG_ON(level + 1 >= BTREE_MAX_DEPTH); + if (unlikely(level >= BTREE_MAX_DEPTH)) { + int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", + level, BTREE_MAX_DEPTH); + return ERR_PTR(ret); + } + + if (unlikely(!bkey_is_btree_ptr(&k->k))) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + + if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + /* * Parent node must be locked, else we could read in a btree node that's * been freed: From e879389f5777de5d94f27f8d88d7e92341afa3ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 15:54:33 -0400 Subject: [PATCH 225/270] bcachefs: Fix bch2_btree_node_fill() for !path We shouldn't be doing the unlock/relock dance when we're not using a path - this fixes an assertion pop when called from btree node scan. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 46 ++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index c347d08d80bc..02c70e813fac 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -709,7 +709,6 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - u32 seq; if (unlikely(level >= BTREE_MAX_DEPTH)) { int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", @@ -775,34 +774,26 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } set_btree_node_read_in_flight(b); - six_unlock_write(&b->c.lock); - seq = six_lock_seq(&b->c.lock); - six_unlock_intent(&b->c.lock); - - /* Unlock before doing IO: */ - if (path && sync) - bch2_trans_unlock_noassert(trans); - - bch2_btree_node_read(trans, b, sync); - - if (!sync) - return NULL; if (path) { - int ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - BUG_ON(!trans->restarted); - return ERR_PTR(ret); - } - } + u32 seq = six_lock_seq(&b->c.lock); - if (!six_relock_type(&b->c.lock, lock_type, seq)) { - BUG_ON(!path); + /* Unlock before doing IO: */ + six_unlock_intent(&b->c.lock); + bch2_trans_unlock_noassert(trans); - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); + bch2_btree_node_read(trans, b, sync); + + if (!sync) + return NULL; + + if (!six_relock_type(&b->c.lock, lock_type, seq)) + b = NULL; + } else { + bch2_btree_node_read(trans, b, sync); + if (lock_type == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); } return b; @@ -1135,18 +1126,19 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; - struct btree *b; BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); - b = btree_cache_find(bc, k); + struct btree *b = btree_cache_find(bc, k); if (b) return 0; b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); - return PTR_ERR_OR_ZERO(b); + if (!IS_ERR_OR_NULL(b)) + six_unlock_read(&b->c.lock); + return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b); } void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) From bdae2a7e6020e1cf864a30dab2af323575569dc7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 22:43:11 -0400 Subject: [PATCH 226/270] bcachefs: sysfs internal/trigger_journal_flush Add a sysfs knob for immediately flushing the entire journal. Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b18b0cc81b59..5be92fe3f4ea 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -25,6 +25,7 @@ #include "ec.h" #include "inode.h" #include "journal.h" +#include "journal_reclaim.h" #include "keylist.h" #include "move.h" #include "movinggc.h" @@ -138,6 +139,7 @@ do { \ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); +write_attribute(trigger_journal_flush); write_attribute(prune_cache); write_attribute(btree_wakeup); rw_attribute(btree_gc_periodic); @@ -500,7 +502,7 @@ STORE(bch2_fs) /* Debugging: */ - if (!test_bit(BCH_FS_rw, &c->flags)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) return -EROFS; if (attr == &sysfs_prune_cache) { @@ -533,6 +535,11 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_invalidates) bch2_do_invalidates(c); + if (attr == &sysfs_trigger_journal_flush) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); + } + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -553,6 +560,7 @@ STORE(bch2_fs) size = ret; } #endif + bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return size; } SYSFS_OPS(bch2_fs); @@ -651,6 +659,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_gc, &sysfs_trigger_discards, &sysfs_trigger_invalidates, + &sysfs_trigger_journal_flush, &sysfs_prune_cache, &sysfs_btree_wakeup, From 27c15ed297cb71c2e7a839439b5a097081a32605 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 18:45:47 -0400 Subject: [PATCH 227/270] bcachefs: bch_member.btree_allocated_bitmap This adds a small (64 bit) per-device bitmap that tracks ranges that have btree nodes, for accelerating btree node scan if it is ever needed. - New helpers, bch2_dev_btree_bitmap_marked() and bch2_dev_bitmap_mark(), for checking and updating the bitmap - Interior btree update path updates the bitmaps when required - The check_allocations pass has a new fsck_err check, btree_bitmap_not_marked - New on disk format version, mi_btree_mitmap, which indicates the new bitmap is present - Upgrade table lists the required recovery pass and expected fsck error - Btree node scan uses the bitmap to skip ranges if we're on the new version Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 7 ++-- fs/bcachefs/btree_gc.c | 13 +++++++ fs/bcachefs/btree_node_scan.c | 9 +++-- fs/bcachefs/btree_update_interior.c | 24 +++++++++++++ fs/bcachefs/sb-downgrade.c | 5 ++- fs/bcachefs/sb-errors_types.h | 3 +- fs/bcachefs/sb-members.c | 53 +++++++++++++++++++++++++++++ fs/bcachefs/sb-members.h | 21 ++++++++++++ fs/bcachefs/super_types.h | 2 ++ 9 files changed, 131 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 9480f1b44f07..085987435a5e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -578,7 +578,8 @@ struct bch_member { __le64 nbuckets; /* device size */ __le16 first_bucket; /* index of first bucket used */ __le16 bucket_size; /* sectors */ - __le32 pad; + __u8 btree_bitmap_shift; + __u8 pad[3]; __le64 last_mount; /* time_t */ __le64 flags; @@ -587,6 +588,7 @@ struct bch_member { __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; __le64 errors_reset_time; __le64 seq; + __le64 btree_allocated_bitmap; }; #define BCH_MEMBER_V1_BYTES 56 @@ -876,7 +878,8 @@ struct bch_sb_field_downgrade { x(rebalance_work, BCH_VERSION(1, 3)) \ x(member_seq, BCH_VERSION(1, 4)) \ x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ - x(btree_subvolume_children, BCH_VERSION(1, 6)) + x(btree_subvolume_children, BCH_VERSION(1, 6)) \ + x(mi_btree_bitmap, BCH_VERSION(1, 7)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d2555da55c6d..ecbd9598f69f 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -828,6 +828,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + struct printbuf buf = PRINTBUF; int ret = 0; deleted.p = k->k->p; @@ -848,11 +849,23 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (ret) goto err; + if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k), + c, btree_bitmap_not_marked, + "btree ptr not marked in member info btree allocated bitmap\n %s", + (bch2_bkey_val_to_text(&buf, c, *k), + buf.buf))) { + mutex_lock(&c->sb_lock); + bch2_dev_btree_bitmap_mark(c, *k); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + ret = commit_do(trans, NULL, NULL, 0, bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); fsck_err: err: + printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 20f2b37c4474..866bd278439f 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -205,8 +205,13 @@ static int read_btree_nodes_worker(void *p) last_print = jiffies; } - try_read_btree_node(w->f, ca, bio, buf, - bucket * ca->mi.bucket_size + bucket_offset); + u64 sector = bucket * ca->mi.bucket_size + bucket_offset; + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && + !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) + continue; + + try_read_btree_node(w->f, ca, bio, buf, sector); } err: bio_put(bio); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 82ac00aa0f7b..6030c396754f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -21,6 +21,7 @@ #include "keylist.h" #include "recovery_passes.h" #include "replicas.h" +#include "sb-members.h" #include "super-io.h" #include "trace.h" @@ -605,6 +606,26 @@ static void btree_update_add_key(struct btree_update *as, bch2_keylist_push(keys); } +static bool btree_update_new_nodes_marked_sb(struct btree_update *as) +{ + for_each_keylist_key(&as->new_keys, k) + if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k))) + return false; + return true; +} + +static void btree_update_new_nodes_mark_sb(struct btree_update *as) +{ + struct bch_fs *c = as->c; + + mutex_lock(&c->sb_lock); + for_each_keylist_key(&as->new_keys, k) + bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k)); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + /* * The transactional part of an interior btree node update, where we journal the * update we did to the interior node and update alloc info: @@ -662,6 +683,9 @@ static void btree_update_nodes_written(struct btree_update *as) if (ret) goto err; + if (!btree_update_new_nodes_marked_sb(as)) + btree_update_new_nodes_mark_sb(as); + /* * Wait for any in flight writes to finish before we free the old nodes * on disk: diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index d6f81179c3a2..a98ef940b7a3 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -51,7 +51,10 @@ BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ x(btree_subvolume_children, \ BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ - BCH_FSCK_ERR_subvol_children_not_set) + BCH_FSCK_ERR_subvol_children_not_set) \ + x(mi_btree_bitmap, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_btree_bitmap_not_marked) #define DOWNGRADE_TABLE() diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index d7d609131030..5b600c6d7aca 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -270,7 +270,8 @@ x(btree_ptr_v2_min_key_bad, 262) \ x(btree_root_unreadable_and_scan_found_nothing, 263) \ x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) + x(dup_backpointer_to_bad_csum_extent, 265) \ + x(btree_bitmap_not_marked, 266) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index eff5ce18c69c..522a969345e5 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "disk_groups.h" #include "opts.h" #include "replicas.h" @@ -426,3 +427,55 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bch2_write_super(c); mutex_unlock(&c->sb_lock); } + +/* + * Per member "range has btree nodes" bitmap: + * + * This is so that if we ever have to run the btree node scan to repair we don't + * have to scan full devices: + */ + +bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) +{ + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) + if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev), + ptr->offset, btree_sectors(c))) + return false; + return true; +} + +static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, + u64 start, unsigned sectors) +{ + struct bch_member *m = __bch2_members_v2_get_mut(mi, dev); + u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap); + + u64 end = start + sectors; + + int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6); + if (resize > 0) { + u64 new_bitmap = 0; + + for (unsigned i = 0; i < 64; i++) + if (bitmap & BIT_ULL(i)) + new_bitmap |= BIT_ULL(i >> resize); + bitmap = new_bitmap; + m->btree_bitmap_shift += resize; + } + + for (unsigned bit = sectors >> m->btree_bitmap_shift; + bit << m->btree_bitmap_shift < end; + bit++) + bitmap |= BIT_ULL(bit); + + m->btree_allocated_bitmap = cpu_to_le64(bitmap); +} + +void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) + __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index be0a94183271..b27c3e4467cf 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -3,6 +3,7 @@ #define _BCACHEFS_SB_MEMBERS_H #include "darray.h" +#include "bkey_types.h" extern char * const bch2_member_error_strs[]; @@ -220,6 +221,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), .valid = bch2_member_exists(mi), + .btree_bitmap_shift = mi->btree_bitmap_shift, + .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), }; } @@ -228,4 +231,22 @@ void bch2_sb_members_from_cpu(struct bch_fs *); void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); void bch2_dev_errors_reset(struct bch_dev *); +static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors) +{ + u64 end = start + sectors; + + if (end > 64 << ca->mi.btree_bitmap_shift) + return false; + + for (unsigned bit = sectors >> ca->mi.btree_bitmap_shift; + bit << ca->mi.btree_bitmap_shift < end; + bit++) + if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit))) + return false; + return true; +} + +bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); +void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); + #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index ec784d975f66..11bcef170c2c 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -37,6 +37,8 @@ struct bch_member_cpu { u8 durability; u8 freespace_initialized; u8 valid; + u8 btree_bitmap_shift; + u64 btree_allocated_bitmap; }; #endif /* _BCACHEFS_SUPER_TYPES_H */ From f0a73d4fde5b285d94a702026216d9fd1fd2733d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 14 Apr 2024 00:51:48 -0400 Subject: [PATCH 228/270] bcachefs: Check for backpointer bucket_offset >= bucket size Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 8 +++++--- fs/bcachefs/backpointers.h | 9 +++------ fs/bcachefs/sb-errors_types.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 114328acde72..fadb1078903d 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -49,13 +49,15 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, if (!bch2_dev_exists2(c, bp.k->p.inode)) return 0; + struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode); struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); int ret = 0; - bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), + bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || + !bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), c, err, - backpointer_pos_wrong, - "backpointer at wrong pos"); + backpointer_bucket_offset_wrong, + "backpointer bucket_offset wrong"); fsck_err: return ret; } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index da012ca7daee..85949b9fd880 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -53,14 +53,11 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, u64 bucket_offset) { struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); - struct bpos ret; - - ret = POS(bucket.inode, - (bucket_to_sector(ca, bucket.offset) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); + struct bpos ret = POS(bucket.inode, + (bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); - return ret; } diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 5b600c6d7aca..4ca6e7b0d8aa 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -130,7 +130,7 @@ x(bucket_gens_nonzero_for_invalid_buckets, 122) \ x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ x(need_discard_freespace_key_bad, 124) \ - x(backpointer_pos_wrong, 125) \ + x(backpointer_bucket_offset_wrong, 125) \ x(backpointer_to_missing_device, 126) \ x(backpointer_to_missing_alloc, 127) \ x(backpointer_to_missing_ptr, 128) \ From 460b0d33cf10eee33de651381d3170ef13241650 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Apr 2024 11:02:02 -0700 Subject: [PATCH 229/270] inet: bring NLM_DONE out to a separate recv() again Commit under Fixes optimized the number of recv() calls needed during RTM_GETROUTE dumps, but we got multiple reports of applications hanging on recv() calls. Applications expect that a route dump will be terminated with a recv() reading an individual NLM_DONE message. Coalescing NLM_DONE is perfectly legal in netlink, but even tho reporters fixed the code in respective projects, chances are it will take time for those applications to get updated. So revert to old behavior (for now)? Old kernel (5.19): $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 692 bytes, 11 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 Recv: read 20 bytes, 1 messages nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 Before (6.9-rc2): $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 712 bytes, 12 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 After: $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 692 bytes, 11 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 Recv: read 20 bytes, 1 messages nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 Reported-by: Stefano Brivio Link: https://lore.kernel.org/all/20240315124808.033ff58d@elisabeth Reported-by: Ilya Maximets Link: https://lore.kernel.org/all/02b50aae-f0e9-47a4-8365-a977a85975d3@ovn.org Fixes: 4ce5dc9316de ("inet: switch inet_dump_fib() to RCU protection") Signed-off-by: Jakub Kicinski Tested-by: Ilya Maximets Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 48741352a88a..c484b1c0fc00 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1050,6 +1050,11 @@ next: e++; } } + + /* Don't let NLM_DONE coalesce into a message, even if it could. + * Some user space expects NLM_DONE in a separate recv(). + */ + err = skb->len; out: cb->args[1] = e; From 75ce9506ee3dc66648a7d74ab3b0acfa364d6d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Fri, 12 Apr 2024 12:02:56 +0000 Subject: [PATCH 230/270] octeontx2-pf: fix FLOW_DIS_IS_FRAGMENT implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upon reviewing the flower control flags handling in this driver, I notice that the key wasn't being used, only the mask. Ie. `tc flower ... ip_flags nofrag` was hardware offloaded as `... ip_flags frag`. Only compile tested, no access to HW. Fixes: c672e3727989 ("octeontx2-pf: Add support to filter packet based on IP fragment") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c index 87bdb93cb066..f4655a8c0705 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c @@ -689,6 +689,7 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node, if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) { struct flow_match_control match; + u32 val; flow_rule_match_control(rule, &match); if (match.mask->flags & FLOW_DIS_FIRST_FRAG) { @@ -697,12 +698,14 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node, } if (match.mask->flags & FLOW_DIS_IS_FRAGMENT) { + val = match.key->flags & FLOW_DIS_IS_FRAGMENT; if (ntohs(flow_spec->etype) == ETH_P_IP) { - flow_spec->ip_flag = IPV4_FLAG_MORE; + flow_spec->ip_flag = val ? IPV4_FLAG_MORE : 0; flow_mask->ip_flag = IPV4_FLAG_MORE; req->features |= BIT_ULL(NPC_IPFRAG_IPV4); } else if (ntohs(flow_spec->etype) == ETH_P_IPV6) { - flow_spec->next_header = IPPROTO_FRAGMENT; + flow_spec->next_header = val ? + IPPROTO_FRAGMENT : 0; flow_mask->next_header = 0xff; req->features |= BIT_ULL(NPC_IPFRAG_IPV6); } else { From a2ac1cbc5397eb4e400efa66c3337886d9a63026 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Mon, 15 Apr 2024 13:10:51 +0530 Subject: [PATCH 231/270] pwm: dwc: allow suspend/resume for 16 channels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With 16 channel pwm support, we're registering two instances of pwm_chip with 8 channels each. We need to update PM functions to use both instances of pwm_chip during power state transitions. Introduce struct dwc_pwm_drvdata and use it as driver_data, which will maintain both instances of pwm_chip along with dwc_pwm_info and allow us to use them inside suspend/resume handles. Fixes: ebf2c89eb95e ("pwm: dwc: Add 16 channel support for Intel Elkhart Lake") Signed-off-by: Raag Jadav Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240415074051.14681-1-raag.jadav@intel.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-dwc-core.c | 1 - drivers/pwm/pwm-dwc.c | 82 +++++++++++++++++++++++++------------- drivers/pwm/pwm-dwc.h | 6 +++ 3 files changed, 60 insertions(+), 29 deletions(-) diff --git a/drivers/pwm/pwm-dwc-core.c b/drivers/pwm/pwm-dwc-core.c index 043736972cb9..c8425493b95d 100644 --- a/drivers/pwm/pwm-dwc-core.c +++ b/drivers/pwm/pwm-dwc-core.c @@ -172,7 +172,6 @@ struct pwm_chip *dwc_pwm_alloc(struct device *dev) dwc->clk_ns = 10; chip->ops = &dwc_pwm_ops; - dev_set_drvdata(dev, chip); return chip; } EXPORT_SYMBOL_GPL(dwc_pwm_alloc); diff --git a/drivers/pwm/pwm-dwc.c b/drivers/pwm/pwm-dwc.c index 676eaf8d7a53..fb3eadf6fbc4 100644 --- a/drivers/pwm/pwm-dwc.c +++ b/drivers/pwm/pwm-dwc.c @@ -31,26 +31,34 @@ static const struct dwc_pwm_info ehl_pwm_info = { .size = 0x1000, }; -static int dwc_pwm_init_one(struct device *dev, void __iomem *base, unsigned int offset) +static int dwc_pwm_init_one(struct device *dev, struct dwc_pwm_drvdata *ddata, unsigned int idx) { struct pwm_chip *chip; struct dwc_pwm *dwc; + int ret; chip = dwc_pwm_alloc(dev); if (IS_ERR(chip)) return PTR_ERR(chip); dwc = to_dwc_pwm(chip); - dwc->base = base + offset; + dwc->base = ddata->io_base + (ddata->info->size * idx); - return devm_pwmchip_add(dev, chip); + ret = devm_pwmchip_add(dev, chip); + if (ret) + return ret; + + ddata->chips[idx] = chip; + return 0; } static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) { const struct dwc_pwm_info *info; struct device *dev = &pci->dev; - int i, ret; + struct dwc_pwm_drvdata *ddata; + unsigned int idx; + int ret; ret = pcim_enable_device(pci); if (ret) @@ -63,17 +71,25 @@ static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) return dev_err_probe(dev, ret, "Failed to iomap PCI BAR\n"); info = (const struct dwc_pwm_info *)id->driver_data; + ddata = devm_kzalloc(dev, struct_size(ddata, chips, info->nr), GFP_KERNEL); + if (!ddata) + return -ENOMEM; - for (i = 0; i < info->nr; i++) { - /* - * No need to check for pcim_iomap_table() failure, - * pcim_iomap_regions() already does it for us. - */ - ret = dwc_pwm_init_one(dev, pcim_iomap_table(pci)[0], i * info->size); + /* + * No need to check for pcim_iomap_table() failure, + * pcim_iomap_regions() already does it for us. + */ + ddata->io_base = pcim_iomap_table(pci)[0]; + ddata->info = info; + + for (idx = 0; idx < ddata->info->nr; idx++) { + ret = dwc_pwm_init_one(dev, ddata, idx); if (ret) return ret; } + dev_set_drvdata(dev, ddata); + pm_runtime_put(dev); pm_runtime_allow(dev); @@ -88,19 +104,24 @@ static void dwc_pwm_remove(struct pci_dev *pci) static int dwc_pwm_suspend(struct device *dev) { - struct pwm_chip *chip = dev_get_drvdata(dev); - struct dwc_pwm *dwc = to_dwc_pwm(chip); - int i; + struct dwc_pwm_drvdata *ddata = dev_get_drvdata(dev); + unsigned int idx; - for (i = 0; i < DWC_TIMERS_TOTAL; i++) { - if (chip->pwms[i].state.enabled) { - dev_err(dev, "PWM %u in use by consumer (%s)\n", - i, chip->pwms[i].label); - return -EBUSY; + for (idx = 0; idx < ddata->info->nr; idx++) { + struct pwm_chip *chip = ddata->chips[idx]; + struct dwc_pwm *dwc = to_dwc_pwm(chip); + unsigned int i; + + for (i = 0; i < DWC_TIMERS_TOTAL; i++) { + if (chip->pwms[i].state.enabled) { + dev_err(dev, "PWM %u in use by consumer (%s)\n", + i, chip->pwms[i].label); + return -EBUSY; + } + dwc->ctx[i].cnt = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT(i)); + dwc->ctx[i].cnt2 = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT2(i)); + dwc->ctx[i].ctrl = dwc_pwm_readl(dwc, DWC_TIM_CTRL(i)); } - dwc->ctx[i].cnt = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT(i)); - dwc->ctx[i].cnt2 = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT2(i)); - dwc->ctx[i].ctrl = dwc_pwm_readl(dwc, DWC_TIM_CTRL(i)); } return 0; @@ -108,14 +129,19 @@ static int dwc_pwm_suspend(struct device *dev) static int dwc_pwm_resume(struct device *dev) { - struct pwm_chip *chip = dev_get_drvdata(dev); - struct dwc_pwm *dwc = to_dwc_pwm(chip); - int i; + struct dwc_pwm_drvdata *ddata = dev_get_drvdata(dev); + unsigned int idx; - for (i = 0; i < DWC_TIMERS_TOTAL; i++) { - dwc_pwm_writel(dwc, dwc->ctx[i].cnt, DWC_TIM_LD_CNT(i)); - dwc_pwm_writel(dwc, dwc->ctx[i].cnt2, DWC_TIM_LD_CNT2(i)); - dwc_pwm_writel(dwc, dwc->ctx[i].ctrl, DWC_TIM_CTRL(i)); + for (idx = 0; idx < ddata->info->nr; idx++) { + struct pwm_chip *chip = ddata->chips[idx]; + struct dwc_pwm *dwc = to_dwc_pwm(chip); + unsigned int i; + + for (i = 0; i < DWC_TIMERS_TOTAL; i++) { + dwc_pwm_writel(dwc, dwc->ctx[i].cnt, DWC_TIM_LD_CNT(i)); + dwc_pwm_writel(dwc, dwc->ctx[i].cnt2, DWC_TIM_LD_CNT2(i)); + dwc_pwm_writel(dwc, dwc->ctx[i].ctrl, DWC_TIM_CTRL(i)); + } } return 0; diff --git a/drivers/pwm/pwm-dwc.h b/drivers/pwm/pwm-dwc.h index a8b074841ae8..c6e2df5a6122 100644 --- a/drivers/pwm/pwm-dwc.h +++ b/drivers/pwm/pwm-dwc.h @@ -38,6 +38,12 @@ struct dwc_pwm_info { unsigned int size; }; +struct dwc_pwm_drvdata { + const struct dwc_pwm_info *info; + void __iomem *io_base; + struct pwm_chip *chips[]; +}; + struct dwc_pwm_ctx { u32 cnt; u32 cnt2; From fb7c3d8ba039df877886fd457538d8b24ca9c84b Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Thu, 4 Apr 2024 10:18:08 +0200 Subject: [PATCH 232/270] dt-bindings: pwm: mediatek,pwm-disp: Document power-domains property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow the power-domains property to the PWM_DISP block as on some SoCs this does need at most one power domain. Fixes: b09b179bac0a ("dt-bindings: pwm: Convert pwm-mtk-disp.txt to mediatek,pwm-disp.yaml format") Signed-off-by: AngeloGioacchino Del Regno Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240404081808.92199-1-angelogioacchino.delregno@collabora.com Signed-off-by: Uwe Kleine-König --- Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml index afcdeed4e88a..bc813fe74fab 100644 --- a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml +++ b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml @@ -52,6 +52,9 @@ properties: - const: main - const: mm + power-domains: + maxItems: 1 + required: - compatible - reg From 3078e059a5e984663c5c2b04485375c84c2700f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 12 Apr 2024 14:36:38 +0800 Subject: [PATCH 233/270] bcachefs: fix error path of __bch2_read_super() In __bch2_read_super(), if kstrdup() fails, it needs to release memory in sb->holder, fix to call bch2_free_super() in the error path. Signed-off-by: Chao Yu Reviewed-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e0aa3655b63b..648986a7dbde 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -700,8 +700,11 @@ retry: return -ENOMEM; sb->sb_name = kstrdup(path, GFP_KERNEL); - if (!sb->sb_name) - return -ENOMEM; + if (!sb->sb_name) { + ret = -ENOMEM; + prt_printf(&err, "error allocating memory for sb_name"); + goto err; + } #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) From ad29cf999a9161e7849aa229d2028854f90728c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 18:02:15 -0400 Subject: [PATCH 234/270] bcachefs: set_btree_iter_dontneed also clears should_be_locked This is part of a larger series cleaning up the semantics of should_be_locked and adding assertions around it; if we don't need an iterator/path anymore, it clearly doesn't need to be locked. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1d58d447b386..1c70836dd7cc 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -498,8 +498,13 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - if (!trans->restarted) - btree_iter_path(trans, iter)->preserve = false; + if (!iter->path || trans->restarted) + return; + + struct btree_path *path = btree_iter_path(trans, iter); + path->preserve = false; + if (path->ref == 1) + path->should_be_locked = false; } void *__bch2_trans_kmalloc(struct btree_trans *, size_t); From 4225dfa4535f219b03ae14147d9c6e7e82ec8df4 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:52 +0100 Subject: [PATCH 235/270] selftests/tcp_ao: Make RST tests less flaky Currently, "active reset" cases are flaky, because select() is called for 3 sockets, while only 2 are expected to receive RST. The idea of the third socket was to get into request_sock_queue, but the test mistakenly attempted to connect() after the listener socket was shut down. Repair this test, it's important to check the different kernel code-paths for signing RST TCP-AO segments. Fixes: c6df7b2361d7 ("selftests/net: Add TCP-AO RST test") Reported-by: Jakub Kicinski Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/rst.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c index 7df8b8700e39..a2fe88d35ac0 100644 --- a/tools/testing/selftests/net/tcp_ao/rst.c +++ b/tools/testing/selftests/net/tcp_ao/rst.c @@ -256,8 +256,6 @@ static int test_wait_fds(int sk[], size_t nr, bool is_writable[], static void test_client_active_rst(unsigned int port) { - /* one in queue, another accept()ed */ - unsigned int wait_for = backlog + 2; int i, sk[3], err; bool is_writable[ARRAY_SIZE(sk)] = {false}; unsigned int last = ARRAY_SIZE(sk) - 1; @@ -275,16 +273,20 @@ static void test_client_active_rst(unsigned int port) for (i = 0; i < last; i++) { err = _test_connect_socket(sk[i], this_ip_dest, port, (i == 0) ? TEST_TIMEOUT_SEC : -1); - if (err < 0) test_error("failed to connect()"); } - synchronize_threads(); /* 2: connection accept()ed, another queued */ - err = test_wait_fds(sk, last, is_writable, wait_for, TEST_TIMEOUT_SEC); + synchronize_threads(); /* 2: two connections: one accept()ed, another queued */ + err = test_wait_fds(sk, last, is_writable, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("test_wait_fds(): %d", err); + /* async connect() with third sk to get into request_sock_queue */ + err = _test_connect_socket(sk[last], this_ip_dest, port, -1); + if (err < 0) + test_error("failed to connect()"); + synchronize_threads(); /* 3: close listen socket */ if (test_client_verify(sk[0], packet_sz, quota / packet_sz, TEST_TIMEOUT_SEC)) test_fail("Failed to send data on connected socket"); @@ -292,13 +294,14 @@ static void test_client_active_rst(unsigned int port) test_ok("Verified established tcp connection"); synchronize_threads(); /* 4: finishing up */ - err = _test_connect_socket(sk[last], this_ip_dest, port, -1); - if (err < 0) - test_error("failed to connect()"); synchronize_threads(); /* 5: closed active sk */ - err = test_wait_fds(sk, ARRAY_SIZE(sk), NULL, - wait_for, TEST_TIMEOUT_SEC); + /* + * Wait for 2 connections: one accepted, another in the accept queue, + * the one in request_sock_queue won't get fully established, so + * doesn't receive an active RST, see inet_csk_listen_stop(). + */ + err = test_wait_fds(sk, last, NULL, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("select(): %d", err); From b089b3bead532419cdcbd8e4e0a3e23c49d11573 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:53 +0100 Subject: [PATCH 236/270] selftests/tcp_ao: Zero-init tcp_ao_info_opt The structure is on the stack and has to be zero-initialized as the kernel checks for: > if (in.reserved != 0 || in.reserved2 != 0) > return -EINVAL; Fixes: b26660531cf6 ("selftests/net: Add test for TCP-AO add setsockopt() command") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/setsockopt-closed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c index 452de131fa3a..517930f9721b 100644 --- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c +++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c @@ -21,7 +21,7 @@ static void make_listen(int sk) static void test_vefify_ao_info(int sk, struct tcp_ao_info_opt *info, const char *tst) { - struct tcp_ao_info_opt tmp; + struct tcp_ao_info_opt tmp = {}; socklen_t len = sizeof(tmp); if (getsockopt(sk, IPPROTO_TCP, TCP_AO_INFO, &tmp, &len)) From beb78cd1329d039d73487ca05633d1b92e1ab2ea Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:54 +0100 Subject: [PATCH 237/270] selftests/tcp_ao: Fix fscanf() call for format-security MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On my new laptop with packages from nixos-unstable, gcc 12.3.0 produces: > lib/proc.c: In function ‘netstat_read_type’: > lib/proc.c:89:9: error: format not a string literal and no format arguments [-Werror=format-security] > 89 | if (fscanf(fnetstat, type->header_name) == EOF) > | ^~ > cc1: some warnings being treated as errors Here the selftests lib parses header name, while expectes non-space word ending with a column. Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reported-by: Muhammad Usama Anjum Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/lib/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tcp_ao/lib/proc.c b/tools/testing/selftests/net/tcp_ao/lib/proc.c index 2fb6dd8adba6..8b984fa04286 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/proc.c +++ b/tools/testing/selftests/net/tcp_ao/lib/proc.c @@ -86,7 +86,7 @@ static void netstat_read_type(FILE *fnetstat, struct netstat **dest, char *line) pos = strchr(line, ' ') + 1; - if (fscanf(fnetstat, type->header_name) == EOF) + if (fscanf(fnetstat, "%[^ :]", type->header_name) == EOF) test_error("fscanf(%s)", type->header_name); if (fread(&tmp, 1, 1, fnetstat) != 1 || tmp != ':') test_error("Unexpected netstat format (%c)", tmp); From b476c93654d748c13624f7c7d0ba191c56a8092e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:55 +0100 Subject: [PATCH 238/270] selftests/tcp_ao: Printing fixes to confirm with format-security MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On my new laptop with packages from nixos-unstable, gcc 12.3.0 produces > lib/setup.c: In function ‘__test_msg’: > lib/setup.c:20:9: error: format not a string literal and no format arguments [-Werror=format-security] > 20 | ksft_print_msg(buf); > | ^~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_ok’: > lib/setup.c:26:9: error: format not a string literal and no format arguments [-Werror=format-security] > 26 | ksft_test_result_pass(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_fail’: > lib/setup.c:32:9: error: format not a string literal and no format arguments [-Werror=format-security] > 32 | ksft_test_result_fail(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_xfail’: > lib/setup.c:38:9: error: format not a string literal and no format arguments [-Werror=format-security] > 38 | ksft_test_result_xfail(buf); > | ^~~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_error’: > lib/setup.c:44:9: error: format not a string literal and no format arguments [-Werror=format-security] > 44 | ksft_test_result_error(buf); > | ^~~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_skip’: > lib/setup.c:50:9: error: format not a string literal and no format arguments [-Werror=format-security] > 50 | ksft_test_result_skip(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > cc1: some warnings being treated as errors As the buffer was already pre-printed into, print it as a string rather than a format-string. Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reported-by: Muhammad Usama Anjum Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/lib/setup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index 92276f916f2f..e408b9243b2c 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -17,37 +17,37 @@ static pthread_mutex_t ksft_print_lock = PTHREAD_MUTEX_INITIALIZER; void __test_msg(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_print_msg(buf); + ksft_print_msg("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_ok(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_pass(buf); + ksft_test_result_pass("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_fail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_fail(buf); + ksft_test_result_fail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_xfail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_xfail(buf); + ksft_test_result_xfail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_error(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_error(buf); + ksft_test_result_error("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_skip(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_skip(buf); + ksft_test_result_skip("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } From 03cea821b82cbf0ee4a285f9dca6cc1d660dbef3 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:09:54 -0500 Subject: [PATCH 239/270] platform/x86/amd: pmf: Decrease error message to debug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASUS ROG Zephyrus G14 doesn't have _CRS in AMDI0102 device and so there are no resources to walk. This is expected behavior because it doesn't support Smart PC. Decrease error message to debug. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218685 Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240410140956.385-1-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/pmf/acpi.c b/drivers/platform/x86/amd/pmf/acpi.c index d0cf46e2fc8e..60fc71c9fb0f 100644 --- a/drivers/platform/x86/amd/pmf/acpi.c +++ b/drivers/platform/x86/amd/pmf/acpi.c @@ -437,7 +437,7 @@ int apmf_check_smart_pc(struct amd_pmf_dev *pmf_dev) status = acpi_walk_resources(ahandle, METHOD_NAME__CRS, apmf_walk_resources, pmf_dev); if (ACPI_FAILURE(status)) { - dev_err(pmf_dev->dev, "acpi_walk_resources failed :%d\n", status); + dev_dbg(pmf_dev->dev, "acpi_walk_resources failed :%d\n", status); return -EINVAL; } From ed13f622bcd594d6cefd6239b1722ed8b84ba98f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:09:55 -0500 Subject: [PATCH 240/270] platform/x86/amd: pmf: Add infrastructure for quirking supported funcs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the event of a BIOS bug add infrastructure that will be utilized to override the return value for supported_funcs to avoid enabling features. Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20240410140956.385-2-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/Makefile | 2 +- drivers/platform/x86/amd/pmf/acpi.c | 5 ++- drivers/platform/x86/amd/pmf/core.c | 1 + drivers/platform/x86/amd/pmf/pmf-quirks.c | 43 +++++++++++++++++++++++ drivers/platform/x86/amd/pmf/pmf.h | 3 ++ 5 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 drivers/platform/x86/amd/pmf/pmf-quirks.c diff --git a/drivers/platform/x86/amd/pmf/Makefile b/drivers/platform/x86/amd/pmf/Makefile index 6b26e48ce8ad..7d6079b02589 100644 --- a/drivers/platform/x86/amd/pmf/Makefile +++ b/drivers/platform/x86/amd/pmf/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_AMD_PMF) += amd-pmf.o amd-pmf-objs := core.o acpi.o sps.o \ auto-mode.o cnqf.o \ - tee-if.o spc.o + tee-if.o spc.o pmf-quirks.o diff --git a/drivers/platform/x86/amd/pmf/acpi.c b/drivers/platform/x86/amd/pmf/acpi.c index 60fc71c9fb0f..1157ec148880 100644 --- a/drivers/platform/x86/amd/pmf/acpi.c +++ b/drivers/platform/x86/amd/pmf/acpi.c @@ -343,7 +343,10 @@ static int apmf_if_verify_interface(struct amd_pmf_dev *pdev) if (err) return err; - pdev->supported_func = output.supported_functions; + /* only set if not already set by a quirk */ + if (!pdev->supported_func) + pdev->supported_func = output.supported_functions; + dev_dbg(pdev->dev, "supported functions:0x%x notifications:0x%x version:%u\n", output.supported_functions, output.notification_mask, output.version); diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index 5d4f80698a8b..64e6e34a2a9a 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -445,6 +445,7 @@ static int amd_pmf_probe(struct platform_device *pdev) mutex_init(&dev->lock); mutex_init(&dev->update_mutex); + amd_pmf_quirks_init(dev); apmf_acpi_init(dev); platform_set_drvdata(pdev, dev); amd_pmf_dbgfs_register(dev); diff --git a/drivers/platform/x86/amd/pmf/pmf-quirks.c b/drivers/platform/x86/amd/pmf/pmf-quirks.c new file mode 100644 index 000000000000..9f3790eaaa30 --- /dev/null +++ b/drivers/platform/x86/amd/pmf/pmf-quirks.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Platform Management Framework Driver Quirks + * + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Mario Limonciello + */ + +#include + +#include "pmf.h" + +struct quirk_entry { + u32 supported_func; +}; + +static struct quirk_entry quirk_no_sps_bug = { + .supported_func = 0x4003, +}; + +static const struct dmi_system_id fwbug_list[] = { + {} +}; + +void amd_pmf_quirks_init(struct amd_pmf_dev *dev) +{ + const struct dmi_system_id *dmi_id; + struct quirk_entry *quirks; + + dmi_id = dmi_first_match(fwbug_list); + if (!dmi_id) + return; + + quirks = dmi_id->driver_data; + if (quirks->supported_func) { + dev->supported_func = quirks->supported_func; + pr_info("Using supported funcs quirk to avoid %s platform firmware bug\n", + dmi_id->ident); + } +} + diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 8c4df5753f40..eeedd0c0395a 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -720,4 +720,7 @@ int apmf_check_smart_pc(struct amd_pmf_dev *pmf_dev); void amd_pmf_populate_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in); void amd_pmf_dump_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in); +/* Quirk infrastructure */ +void amd_pmf_quirks_init(struct amd_pmf_dev *dev); + #endif /* PMF_H */ From 9d893061ed68820de24b572d1e193b5e4737f2e0 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:09:56 -0500 Subject: [PATCH 241/270] platform/x86/amd: pmf: Add quirk for ROG Zephyrus G14 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ROG Zephyrus G14 advertises support for SPS notifications to the BIOS but doesn't actually use them. Instead the asus-nb-wmi driver utilizes such events. Add a quirk to prevent the system from registering for ACPI platform profile when this system is found to avoid conflicts. Reported-by: al0uette@outlook.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218685 Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20240410140956.385-3-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/pmf-quirks.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/platform/x86/amd/pmf/pmf-quirks.c b/drivers/platform/x86/amd/pmf/pmf-quirks.c index 9f3790eaaa30..0b2eb0ae85fe 100644 --- a/drivers/platform/x86/amd/pmf/pmf-quirks.c +++ b/drivers/platform/x86/amd/pmf/pmf-quirks.c @@ -21,6 +21,14 @@ static struct quirk_entry quirk_no_sps_bug = { }; static const struct dmi_system_id fwbug_list[] = { + { + .ident = "ROG Zephyrus G14", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "GA403UV"), + }, + .driver_data = &quirk_no_sps_bug, + }, {} }; From d8c2d38c4d1dee8fe8e015b9ebf65bdd8e4da99b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 15 Apr 2024 14:28:53 -0700 Subject: [PATCH 242/270] platform/x86: ISST: Add Granite Rapids-D to HPM CPU list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Granite Rapids-D to hpm_cpu_ids, so that MSR 0x54 can be used. Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20240415212853.2820470-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/speed_select_if/isst_if_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c index 08df9494603c..30951f7131cd 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c @@ -719,6 +719,7 @@ static struct miscdevice isst_if_char_driver = { }; static const struct x86_cpu_id hpm_cpu_ids[] = { + X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, NULL), X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, NULL), X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, NULL), {} From bc774d46b41482534c7ba92f6342ca0a355c13af Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 15 Apr 2024 15:06:25 -0700 Subject: [PATCH 243/270] platform/x86/intel-uncore-freq: Increase minor number support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No new changes will be added for minor version 2. Change the minor version number to 2 and stop displaying log message for unsupported minor version 2. Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20240415220625.2828339-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../x86/intel/uncore-frequency/uncore-frequency-tpmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c index bd75d61ff8a6..ef730200a04b 100644 --- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c @@ -29,7 +29,7 @@ #include "uncore-frequency-common.h" #define UNCORE_MAJOR_VERSION 0 -#define UNCORE_MINOR_VERSION 1 +#define UNCORE_MINOR_VERSION 2 #define UNCORE_HEADER_INDEX 0 #define UNCORE_FABRIC_CLUSTER_OFFSET 8 @@ -329,7 +329,7 @@ static int uncore_probe(struct auxiliary_device *auxdev, const struct auxiliary_ goto remove_clusters; } - if (TPMI_MINOR_VERSION(pd_info->ufs_header_ver) != UNCORE_MINOR_VERSION) + if (TPMI_MINOR_VERSION(pd_info->ufs_header_ver) > UNCORE_MINOR_VERSION) dev_info(&auxdev->dev, "Uncore: Ignore: Unsupported minor version:%lx\n", TPMI_MINOR_VERSION(pd_info->ufs_header_ver)); From 0ebd96f5da4410c0cb8fc75e44f1009530b2f90b Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:14 +0300 Subject: [PATCH 244/270] net: stmmac: Apply half-duplex-less constraint for DW QoS Eth only There are three DW MAC IP-cores which can have the multiple Tx/Rx queues enabled: DW GMAC v3.7+ with AV feature, DW QoS Eth v4.x/v5.x, DW XGMAC/XLGMAC Based on the respective HW databooks, only the DW QoS Eth IP-core doesn't support the half-duplex link mode in case if more than one queues enabled: "In multiple queue/channel configurations, for half-duplex operation, enable only the Q0/CH0 on Tx and Rx. For single queue/channel in full-duplex operation, any queue/channel can be enabled." The rest of the IP-cores don't have such constraint. Thus in order to have the constraint applied for the DW QoS Eth MACs only, let's move the it' implementation to the respective MAC-capabilities getter and make sure the getter is called in the queues re-init procedure. Fixes: b6cfffa7ad92 ("stmmac: fix DMA channel hang in half-duplex mode") Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- .../net/ethernet/stmicro/stmmac/dwmac4_core.c | 7 +++++++ .../net/ethernet/stmicro/stmmac/stmmac_main.c | 19 +++---------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index cef25efbdff9..ec6a13e644b3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -71,6 +71,13 @@ static void dwmac4_core_init(struct mac_device_info *hw, static void dwmac4_phylink_get_caps(struct stmmac_priv *priv) { priv->phylink_config.mac_capabilities |= MAC_2500FD; + + if (priv->plat->tx_queues_to_use > 1) + priv->phylink_config.mac_capabilities &= + ~(MAC_10HD | MAC_100HD | MAC_1000HD); + else + priv->phylink_config.mac_capabilities |= + (MAC_10HD | MAC_100HD | MAC_1000HD); } static void dwmac4_rx_queue_enable(struct mac_device_info *hw, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 24cd80490d19..dd58c21b53ee 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1198,17 +1198,6 @@ static int stmmac_init_phy(struct net_device *dev) return ret; } -static void stmmac_set_half_duplex(struct stmmac_priv *priv) -{ - /* Half-Duplex can only work with single tx queue */ - if (priv->plat->tx_queues_to_use > 1) - priv->phylink_config.mac_capabilities &= - ~(MAC_10HD | MAC_100HD | MAC_1000HD); - else - priv->phylink_config.mac_capabilities |= - (MAC_10HD | MAC_100HD | MAC_1000HD); -} - static int stmmac_phy_setup(struct stmmac_priv *priv) { struct stmmac_mdio_bus_data *mdio_bus_data; @@ -1237,10 +1226,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) priv->phylink_config.supported_interfaces); priv->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | - MAC_10FD | MAC_100FD | - MAC_1000FD; - - stmmac_set_half_duplex(priv); + MAC_10 | MAC_100 | MAC_1000; /* Get the MAC specific capabilities */ stmmac_mac_phylink_get_caps(priv); @@ -7355,7 +7341,8 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) priv->rss.table[i] = ethtool_rxfh_indir_default(i, rx_cnt); - stmmac_set_half_duplex(priv); + stmmac_mac_phylink_get_caps(priv); + stmmac_napi_add(dev); if (netif_running(dev)) From 59c3d6ca6cbded6c6599e975b42a9d6a27fcbaf2 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:15 +0300 Subject: [PATCH 245/270] net: stmmac: Fix max-speed being ignored on queue re-init It's possible to have the maximum link speed being artificially limited on the platform-specific basis. It's done either by setting up the plat_stmmacenet_data::max_speed field or by specifying the "max-speed" DT-property. In such cases it's required that any specific MAC-capabilities re-initializations would take the limit into account. In particular the link speed capabilities may change during the number of active Tx/Rx queues re-initialization. But the currently implemented procedure doesn't take the speed limit into account. Fix that by calling phylink_limit_mac_speed() in the stmmac_reinit_queues() method if the speed limitation was required in the same way as it's done in the stmmac_phy_setup() function. Fixes: 95201f36f395 ("net: stmmac: update MAC capabilities when tx queues are updated") Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index dd58c21b53ee..b8a1f02398ee 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7328,6 +7328,7 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) { struct stmmac_priv *priv = netdev_priv(dev); int ret = 0, i; + int max_speed; if (netif_running(dev)) stmmac_release(dev); @@ -7343,6 +7344,10 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) stmmac_mac_phylink_get_caps(priv); + max_speed = priv->plat->max_speed; + if (max_speed) + phylink_limit_mac_speed(&priv->phylink_config, max_speed); + stmmac_napi_add(dev); if (netif_running(dev)) From 9cb54af214a7cdc91577ec083e5569f2ce2c86d8 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:16 +0300 Subject: [PATCH 246/270] net: stmmac: Fix IP-cores specific MAC capabilities Here is the list of the MAC capabilities specific to the particular DW MAC IP-cores currently supported by the driver: DW MAC100: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 DW GMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 Allwinner sun8i MAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 DW QoS Eth: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD if there is more than 1 active Tx/Rx queues: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10FD | MAC_100FD | MAC_1000FD | MAC_2500FD DW XGMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_1000FD | MAC_2500FD | MAC_5000FD | MAC_10000FD DW XLGMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_1000FD | MAC_2500FD | MAC_5000FD | MAC_10000FD | MAC_25000FD | MAC_40000FD | MAC_50000FD | MAC_100000FD As you can see there are only two common capabilities: MAC_ASYM_PAUSE | MAC_SYM_PAUSE. Meanwhile what is currently implemented defines 10/100/1000 link speeds for all IP-cores, which is definitely incorrect for DW MAC100, DW XGMAC and DW XLGMAC devices. Seeing the flow-control is implemented as a callback for each MAC IP-core (see dwmac100_flow_ctrl(), dwmac1000_flow_ctrl(), sun8i_dwmac_flow_ctrl(), etc) and since the MAC-specific setup() method is supposed to be called for each available DW MAC-based device, the capabilities initialization can be freely moved to these setup() functions, thus correctly setting up the MAC-capabilities for each IP-core (including the Allwinner Sun8i). A new stmmac_link::caps field was specifically introduced for that so to have all link-specific info preserved in a single structure. Note the suggested change fixes three earlier commits at a time. The commit 5b0d7d7da64b ("net: stmmac: Add the missing speeds that XGMAC supports") permitted the 10-100 link speeds and 1G half-duplex mode for DW XGMAC IP-core even though it doesn't support them. The commit df7699c70c1b ("net: stmmac: Do not cut down 1G modes") incorrectly added the MAC1000 capability to the DW MAC100 IP-core. Similarly to the DW XGMAC the commit 8a880936e902 ("net: stmmac: Add XLGMII support") incorrectly permitted the 10-100 link speeds and 1G half-duplex mode for DW XLGMAC IP-core. Fixes: 5b0d7d7da64b ("net: stmmac: Add the missing speeds that XGMAC supports") Fixes: df7699c70c1b ("net: stmmac: Do not cut down 1G modes") Fixes: 8a880936e902 ("net: stmmac: Add XLGMII support") Suggested-by: Russell King (Oracle) Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/common.h | 1 + .../net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 ++ .../ethernet/stmicro/stmmac/dwmac1000_core.c | 2 ++ .../ethernet/stmicro/stmmac/dwmac100_core.c | 2 ++ .../net/ethernet/stmicro/stmmac/dwmac4_core.c | 10 ++++------ .../ethernet/stmicro/stmmac/dwxgmac2_core.c | 18 ++++++++---------- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 7 ++++--- 7 files changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index a6fefe675ef1..3b7d4ac1e7be 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -553,6 +553,7 @@ extern const struct stmmac_hwtimestamp stmmac_ptp; extern const struct stmmac_mode_ops dwmac4_ring_mode_ops; struct mac_link { + u32 caps; u32 speed_mask; u32 speed10; u32 speed100; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index b21d99faa2d0..e1b761dcfa1d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1096,6 +1096,8 @@ static struct mac_device_info *sun8i_dwmac_setup(void *ppriv) priv->dev->priv_flags |= IFF_UNICAST_FLT; + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000; /* The loopback bit seems to be re-set when link change * Simply mask it each time * Speed 10/100/1000 are set in BIT(2)/BIT(3) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index 3927609abc44..8555299443f4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -539,6 +539,8 @@ int dwmac1000_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000; mac->link.duplex = GMAC_CONTROL_DM; mac->link.speed10 = GMAC_CONTROL_PS; mac->link.speed100 = GMAC_CONTROL_PS | GMAC_CONTROL_FES; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c index a6e8d7bd9588..7667d103cd0e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c @@ -175,6 +175,8 @@ int dwmac100_setup(struct stmmac_priv *priv) dev_info(priv->device, "\tDWMAC100\n"); mac->pcsr = priv->ioaddr; + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100; mac->link.duplex = MAC_CONTROL_F; mac->link.speed10 = 0; mac->link.speed100 = 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index ec6a13e644b3..a38226d7cc6a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -70,14 +70,10 @@ static void dwmac4_core_init(struct mac_device_info *hw, static void dwmac4_phylink_get_caps(struct stmmac_priv *priv) { - priv->phylink_config.mac_capabilities |= MAC_2500FD; - if (priv->plat->tx_queues_to_use > 1) - priv->phylink_config.mac_capabilities &= - ~(MAC_10HD | MAC_100HD | MAC_1000HD); + priv->hw->link.caps &= ~(MAC_10HD | MAC_100HD | MAC_1000HD); else - priv->phylink_config.mac_capabilities |= - (MAC_10HD | MAC_100HD | MAC_1000HD); + priv->hw->link.caps |= (MAC_10HD | MAC_100HD | MAC_1000HD); } static void dwmac4_rx_queue_enable(struct mac_device_info *hw, @@ -1385,6 +1381,8 @@ int dwmac4_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD; mac->link.duplex = GMAC_CONFIG_DM; mac->link.speed10 = GMAC_CONFIG_PS; mac->link.speed100 = GMAC_CONFIG_FES | GMAC_CONFIG_PS; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index e841e312077e..f8e7775bb633 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -47,14 +47,6 @@ static void dwxgmac2_core_init(struct mac_device_info *hw, writel(XGMAC_INT_DEFAULT_EN, ioaddr + XGMAC_INT_EN); } -static void xgmac_phylink_get_caps(struct stmmac_priv *priv) -{ - priv->phylink_config.mac_capabilities |= MAC_2500FD | MAC_5000FD | - MAC_10000FD | MAC_25000FD | - MAC_40000FD | MAC_50000FD | - MAC_100000FD; -} - static void dwxgmac2_set_mac(void __iomem *ioaddr, bool enable) { u32 tx = readl(ioaddr + XGMAC_TX_CONFIG); @@ -1540,7 +1532,6 @@ static void dwxgmac3_fpe_configure(void __iomem *ioaddr, struct stmmac_fpe_cfg * const struct stmmac_ops dwxgmac210_ops = { .core_init = dwxgmac2_core_init, - .phylink_get_caps = xgmac_phylink_get_caps, .set_mac = dwxgmac2_set_mac, .rx_ipc = dwxgmac2_rx_ipc, .rx_queue_enable = dwxgmac2_rx_queue_enable, @@ -1601,7 +1592,6 @@ static void dwxlgmac2_rx_queue_enable(struct mac_device_info *hw, u8 mode, const struct stmmac_ops dwxlgmac2_ops = { .core_init = dwxgmac2_core_init, - .phylink_get_caps = xgmac_phylink_get_caps, .set_mac = dwxgmac2_set_mac, .rx_ipc = dwxgmac2_rx_ipc, .rx_queue_enable = dwxlgmac2_rx_queue_enable, @@ -1661,6 +1651,9 @@ int dwxgmac2_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_1000FD | MAC_2500FD | MAC_5000FD | + MAC_10000FD; mac->link.duplex = 0; mac->link.speed10 = XGMAC_CONFIG_SS_10_MII; mac->link.speed100 = XGMAC_CONFIG_SS_100_MII; @@ -1698,6 +1691,11 @@ int dwxlgmac2_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_1000FD | MAC_2500FD | MAC_5000FD | + MAC_10000FD | MAC_25000FD | + MAC_40000FD | MAC_50000FD | + MAC_100000FD; mac->link.duplex = 0; mac->link.speed1000 = XLGMAC_CONFIG_SS_1000; mac->link.speed2500 = XLGMAC_CONFIG_SS_2500; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b8a1f02398ee..7c6fb14b5555 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1225,12 +1225,11 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) xpcs_get_interfaces(priv->hw->xpcs, priv->phylink_config.supported_interfaces); - priv->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | - MAC_10 | MAC_100 | MAC_1000; - /* Get the MAC specific capabilities */ stmmac_mac_phylink_get_caps(priv); + priv->phylink_config.mac_capabilities = priv->hw->link.caps; + max_speed = priv->plat->max_speed; if (max_speed) phylink_limit_mac_speed(&priv->phylink_config, max_speed); @@ -7344,6 +7343,8 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) stmmac_mac_phylink_get_caps(priv); + priv->phylink_config.mac_capabilities = priv->hw->link.caps; + max_speed = priv->plat->max_speed; if (max_speed) phylink_limit_mac_speed(&priv->phylink_config, max_speed); From 428051600cb4e5a61d81aba3f8009b6c4f5e7582 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 15 Mar 2024 12:08:20 +0100 Subject: [PATCH 247/270] ice: tc: check src_vsi in case of traffic from VF In case of traffic going from the VF (so ingress for port representor) source VSI should be consider during packet classification. It is needed for hardware to not match packets from different ports with filters added on other port. It is only for "from VF" traffic, because other traffic direction doesn't have source VSI. Set correct ::src_vsi in rule_info to pass it to the hardware filter. For example this rule should drop only ipv4 packets from eth10, not from the others VF PRs. It is needed to check source VSI in this case. $tc filter add dev eth10 ingress protocol ip flower skip_sw action drop Fixes: 0d08a441fb1a ("ice: ndo_setup_tc implementation for PF") Reviewed-by: Jedrzej Jagielski Reviewed-by: Sridhar Samudrala Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index b890410a2bc0..49ed5fd7db10 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -28,6 +28,8 @@ ice_tc_count_lkups(u32 flags, struct ice_tc_flower_lyr_2_4_hdrs *headers, * - ICE_TC_FLWR_FIELD_VLAN_TPID (present if specified) * - Tunnel flag (present if tunnel) */ + if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS) + lkups_cnt++; if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) lkups_cnt++; @@ -363,6 +365,11 @@ ice_tc_fill_rules(struct ice_hw *hw, u32 flags, /* Always add direction metadata */ ice_rule_add_direction_metadata(&list[ICE_TC_METADATA_LKUP_IDX]); + if (tc_fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { + ice_rule_add_src_vsi_metadata(&list[i]); + i++; + } + rule_info->tun_type = ice_sw_type_from_tunnel(tc_fltr->tunnel_type); if (tc_fltr->tunnel_type != TNL_LAST) { i = ice_tc_fill_tunnel_outer(flags, tc_fltr, list, i); @@ -820,6 +827,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) /* specify the cookie as filter_rule_id */ rule_info.fltr_rule_id = fltr->cookie; + rule_info.src_vsi = vsi->idx; ret = ice_add_adv_rule(hw, list, lkups_cnt, &rule_info, &rule_added); if (ret == -EEXIST) { From 73278715725a8347032acf233082ca4eb31e6a56 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 15 Mar 2024 12:08:21 +0100 Subject: [PATCH 248/270] ice: tc: allow zero flags in parsing tc flower The check for flags is done to not pass empty lookups to adding switch rule functions. Since metadata is always added to lookups there is no need to check against the flag. It is also fixing the problem with such rule: $ tc filter add dev gtp_dev ingress protocol ip prio 0 flower \ enc_dst_port 2123 action drop Switch block in case of GTP can't parse the destination port, because it should always be set to GTP specific value. The same with ethertype. The result is that there is no other matching criteria than GTP tunnel. In this case flags is 0, rule can't be added only because of defensive check against flags. Fixes: 9a225f81f540 ("ice: Support GTP-U and GTP-C offload in switchdev") Reviewed-by: Wojciech Drewek Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index 49ed5fd7db10..bcbcfc67e560 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -779,7 +779,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) int ret; int i; - if (!flags || (flags & ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT)) { + if (flags & ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT) { NL_SET_ERR_MSG_MOD(fltr->extack, "Unsupported encap field(s)"); return -EOPNOTSUPP; } From 2cca35f5dd78b9f8297c879c5db5ab137c5d86c3 Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Tue, 9 Apr 2024 17:45:44 +0200 Subject: [PATCH 249/270] ice: Fix checking for unsupported keys on non-tunnel device Add missing FLOW_DISSECTOR_KEY_ENC_* checks to TC flower filter parsing. Without these checks, it would be possible to add filters with tunnel options on non-tunnel devices. enc_* options are only valid for tunnel devices. Example: devlink dev eswitch set $PF1_PCI mode switchdev echo 1 > /sys/class/net/$PF1/device/sriov_numvfs tc qdisc add dev $VF1_PR ingress ethtool -K $PF1 hw-tc-offload on tc filter add dev $VF1_PR ingress flower enc_ttl 12 skip_sw action drop Fixes: 9e300987d4a8 ("ice: VXLAN and Geneve TC support") Reviewed-by: Michal Swiatkowski Signed-off-by: Marcin Szycik Reviewed-by: Jacob Keller Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index bcbcfc67e560..688ccb0615ab 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -1489,7 +1489,10 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, (BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS))) { + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_OPTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL))) { NL_SET_ERR_MSG_MOD(fltr->extack, "Tunnel key used, but device isn't a tunnel"); return -EOPNOTSUPP; } else { From f8bbc07ac535593139c875ffa19af924b1084540 Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Sun, 14 Apr 2024 22:02:46 -0400 Subject: [PATCH 250/270] tun: limit printing rate when illegal packet received by tun dev vhost_worker will call tun call backs to receive packets. If too many illegal packets arrives, tun_do_read will keep dumping packet contents. When console is enabled, it will costs much more cpu time to dump packet and soft lockup will be detected. net_ratelimit mechanism can be used to limit the dumping rate. PID: 33036 TASK: ffff949da6f20000 CPU: 23 COMMAND: "vhost-32980" #0 [fffffe00003fce50] crash_nmi_callback at ffffffff89249253 #1 [fffffe00003fce58] nmi_handle at ffffffff89225fa3 #2 [fffffe00003fceb0] default_do_nmi at ffffffff8922642e #3 [fffffe00003fced0] do_nmi at ffffffff8922660d #4 [fffffe00003fcef0] end_repeat_nmi at ffffffff89c01663 [exception RIP: io_serial_in+20] RIP: ffffffff89792594 RSP: ffffa655314979e8 RFLAGS: 00000002 RAX: ffffffff89792500 RBX: ffffffff8af428a0 RCX: 0000000000000000 RDX: 00000000000003fd RSI: 0000000000000005 RDI: ffffffff8af428a0 RBP: 0000000000002710 R8: 0000000000000004 R9: 000000000000000f R10: 0000000000000000 R11: ffffffff8acbf64f R12: 0000000000000020 R13: ffffffff8acbf698 R14: 0000000000000058 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #5 [ffffa655314979e8] io_serial_in at ffffffff89792594 #6 [ffffa655314979e8] wait_for_xmitr at ffffffff89793470 #7 [ffffa65531497a08] serial8250_console_putchar at ffffffff897934f6 #8 [ffffa65531497a20] uart_console_write at ffffffff8978b605 #9 [ffffa65531497a48] serial8250_console_write at ffffffff89796558 #10 [ffffa65531497ac8] console_unlock at ffffffff89316124 #11 [ffffa65531497b10] vprintk_emit at ffffffff89317c07 #12 [ffffa65531497b68] printk at ffffffff89318306 #13 [ffffa65531497bc8] print_hex_dump at ffffffff89650765 #14 [ffffa65531497ca8] tun_do_read at ffffffffc0b06c27 [tun] #15 [ffffa65531497d38] tun_recvmsg at ffffffffc0b06e34 [tun] #16 [ffffa65531497d68] handle_rx at ffffffffc0c5d682 [vhost_net] #17 [ffffa65531497ed0] vhost_worker at ffffffffc0c644dc [vhost] #18 [ffffa65531497f10] kthread at ffffffff892d2e72 #19 [ffffa65531497f50] ret_from_fork at ffffffff89c0022f Fixes: ef3db4a59542 ("tun: avoid BUG, dump packet on GSO errors") Signed-off-by: Lei Chen Reviewed-by: Willem de Bruijn Acked-by: Jason Wang Reviewed-by: Eric Dumazet Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20240415020247.2207781-1-lei.chen@smartx.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 0b3f21cba552..92da8c03d960 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2125,14 +2125,16 @@ static ssize_t tun_put_user(struct tun_struct *tun, tun_is_little_endian(tun), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); - pr_err("unexpected GSO type: " - "0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), - tun16_to_cpu(tun, gso.hdr_len)); - print_hex_dump(KERN_ERR, "tun: ", - DUMP_PREFIX_NONE, - 16, 1, skb->head, - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + + if (net_ratelimit()) { + netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", + sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), + tun16_to_cpu(tun, gso.hdr_len)); + print_hex_dump(KERN_ERR, "tun: ", + DUMP_PREFIX_NONE, + 16, 1, skb->head, + min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + } WARN_ON_ONCE(1); return -EINVAL; } From d59cf049c8378677053703e724808836f180888e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Sat, 13 Apr 2024 16:01:39 +0300 Subject: [PATCH 251/270] net: dsa: mt7530: fix mirroring frames received on local port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This switch intellectual property provides a bit on the ARL global control register which controls allowing mirroring frames which are received on the local port (monitor port). This bit is unset after reset. This ability must be enabled to fully support the port mirroring feature on this switch intellectual property. Therefore, this patch fixes the traffic not being reflected on a port, which would be configured like below: tc qdisc add dev swp0 clsact tc filter add dev swp0 ingress matchall skip_sw \ action mirred egress mirror dev swp0 As a side note, this configuration provides the hairpinning feature for a single port. Fixes: 37feab6076aa ("net: dsa: mt7530: add support for port mirroring") Signed-off-by: Arınç ÜNAL Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 6 ++++++ drivers/net/dsa/mt7530.h | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index c0d0bce0b594..b84e1845fa02 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2480,6 +2480,9 @@ mt7530_setup(struct dsa_switch *ds) PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT)); } + /* Allow mirroring frames received on the local port (monitor port). */ + mt7530_set(priv, MT753X_AGC, LOCAL_EN); + /* Setup VLAN ID 0 for VLAN-unaware bridges */ ret = mt7530_setup_vlan0(priv); if (ret) @@ -2591,6 +2594,9 @@ mt7531_setup_common(struct dsa_switch *ds) PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT)); } + /* Allow mirroring frames received on the local port (monitor port). */ + mt7530_set(priv, MT753X_AGC, LOCAL_EN); + /* Flush the FDB table */ ret = mt7530_fdb_cmd(priv, MT7530_FDB_FLUSH, NULL); if (ret < 0) diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 585db03c0548..a08053390b28 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -32,6 +32,10 @@ enum mt753x_id { #define SYSC_REG_RSTCTRL 0x34 #define RESET_MCM BIT(2) +/* Register for ARL global control */ +#define MT753X_AGC 0xc +#define LOCAL_EN BIT(7) + /* Registers to mac forward control for unknown frames */ #define MT7530_MFC 0x10 #define BC_FFP(x) (((x) & 0xff) << 24) From 2c606d138518cc69f09c35929abc414a99e3a28f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Sat, 13 Apr 2024 16:01:40 +0300 Subject: [PATCH 252/270] net: dsa: mt7530: fix port mirroring for MT7988 SoC switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "MT7988A Wi-Fi 7 Generation Router Platform: Datasheet (Open Version) v0.1" document shows bits 16 to 18 as the MIRROR_PORT field of the CPU forward control register. Currently, the MT7530 DSA subdriver configures bits 0 to 2 of the CPU forward control register which breaks the port mirroring feature for the MT7988 SoC switch. Fix this by using the MT7531_MIRROR_PORT_GET() and MT7531_MIRROR_PORT_SET() macros which utilise the correct bits. Fixes: 110c18bfed41 ("net: dsa: mt7530: introduce driver for MT7988 built-in switch") Signed-off-by: Arınç ÜNAL Acked-by: Daniel Golle Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index b84e1845fa02..8090390edaf9 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1883,14 +1883,16 @@ mt7530_port_vlan_del(struct dsa_switch *ds, int port, static int mt753x_mirror_port_get(unsigned int id, u32 val) { - return (id == ID_MT7531) ? MT7531_MIRROR_PORT_GET(val) : - MIRROR_PORT(val); + return (id == ID_MT7531 || id == ID_MT7988) ? + MT7531_MIRROR_PORT_GET(val) : + MIRROR_PORT(val); } static int mt753x_mirror_port_set(unsigned int id, u32 val) { - return (id == ID_MT7531) ? MT7531_MIRROR_PORT_SET(val) : - MIRROR_PORT(val); + return (id == ID_MT7531 || id == ID_MT7988) ? + MT7531_MIRROR_PORT_SET(val) : + MIRROR_PORT(val); } static int mt753x_port_mirror_add(struct dsa_switch *ds, int port, From e871abcda3b67d0820b4182ebe93435624e9c6a4 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 17 Apr 2024 13:38:29 +0200 Subject: [PATCH 253/270] random: handle creditable entropy from atomic process context The entropy accounting changes a static key when the RNG has initialized, since it only ever initializes once. Static key changes, however, cannot be made from atomic context, so depending on where the last creditable entropy comes from, the static key change might need to be deferred to a worker. Previously the code used the execute_in_process_context() helper function, which accounts for whether or not the caller is in_interrupt(). However, that doesn't account for the case where the caller is actually in process context but is holding a spinlock. This turned out to be the case with input_handle_event() in drivers/input/input.c contributing entropy: [] die+0xa8/0x2fc [] bug_handler+0x44/0xec [] brk_handler+0x90/0x144 [] do_debug_exception+0xa0/0x148 [] el1_dbg+0x60/0x7c [] el1h_64_sync_handler+0x38/0x90 [] el1h_64_sync+0x64/0x6c [] __might_resched+0x1fc/0x2e8 [] __might_sleep+0x44/0x7c [] cpus_read_lock+0x1c/0xec [] static_key_enable+0x14/0x38 [] crng_set_ready+0x14/0x28 [] execute_in_process_context+0xb8/0xf8 [] _credit_init_bits+0x118/0x1dc [] add_timer_randomness+0x264/0x270 [] add_input_randomness+0x38/0x48 [] input_handle_event+0x2b8/0x490 [] input_event+0x6c/0x98 According to Guoyong, it's not really possible to refactor the various drivers to never hold a spinlock there. And in_atomic() isn't reliable. So, rather than trying to be too fancy, just punt the change in the static key to a workqueue always. There's basically no drawback of doing this, as the code already needed to account for the static key not changing immediately, and given that it's just an optimization, there's not exactly a hurry to change the static key right away, so deferal is fine. Reported-by: Guoyong Wang Cc: stable@vger.kernel.org Fixes: f5bda35fba61 ("random: use static branch for crng_ready()") Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 456be28ba67c..2597cb43f438 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -702,7 +702,7 @@ static void extract_entropy(void *buf, size_t len) static void __cold _credit_init_bits(size_t bits) { - static struct execute_work set_ready; + static DECLARE_WORK(set_ready, crng_set_ready); unsigned int new, orig, add; unsigned long flags; @@ -718,8 +718,8 @@ static void __cold _credit_init_bits(size_t bits) if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) { crng_reseed(NULL); /* Sets crng_init to CRNG_READY under base_crng.lock. */ - if (static_key_initialized) - execute_in_process_context(crng_set_ready, &set_ready); + if (static_key_initialized && system_unbound_wq) + queue_work(system_unbound_wq, &set_ready); atomic_notifier_call_chain(&random_ready_notifier, 0, NULL); wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); @@ -890,8 +890,8 @@ void __init random_init(void) /* * If we were initialized by the cpu or bootloader before jump labels - * are initialized, then we should enable the static branch here, where - * it's guaranteed that jump labels have been initialized. + * or workqueues are initialized, then we should enable the static + * branch here, where it's guaranteed that these have been initialized. */ if (!static_branch_likely(&crng_is_ready) && crng_init >= CRNG_READY) crng_set_ready(NULL); From 83781384a96b95e2b6403d3c8a002b2c89031770 Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Mon, 15 Apr 2024 15:15:07 +0200 Subject: [PATCH 254/270] s390/ism: Properly fix receive message buffer allocation Since [1], dma_alloc_coherent() does not accept requests for GFP_COMP anymore, even on archs that may be able to fulfill this. Functionality that relied on the receive buffer being a compound page broke at that point: The SMC-D protocol, that utilizes the ism device driver, passes receive buffers to the splice processor in a struct splice_pipe_desc with a single entry list of struct pages. As the buffer is no longer a compound page, the splice processor now rejects requests to handle more than a page worth of data. Replace dma_alloc_coherent() and allocate a buffer with folio_alloc and create a DMA map for it with dma_map_page(). Since only receive buffers on ISM devices use DMA, qualify the mapping as FROM_DEVICE. Since ISM devices are available on arch s390, only, and on that arch all DMA is coherent, there is no need to introduce and export some kind of dma_sync_to_cpu() method to be called by the SMC-D protocol layer. Analogously, replace dma_free_coherent by a two step dma_unmap_page, then folio_put to free the receive buffer. [1] https://lore.kernel.org/all/20221113163535.884299-1-hch@lst.de/ Fixes: c08004eede4b ("s390/ism: don't pass bogus GFP_ flags to dma_alloc_coherent") Signed-off-by: Gerd Bayer Signed-off-by: David S. Miller --- drivers/s390/net/ism_drv.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2c8e964425dc..43778b088ffa 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -292,13 +292,16 @@ out: static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); - dma_free_coherent(&ism->pdev->dev, dmb->dmb_len, - dmb->cpu_addr, dmb->dma_addr); + dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, + DMA_FROM_DEVICE); + folio_put(virt_to_folio(dmb->cpu_addr)); } static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { + struct folio *folio; unsigned long bit; + int rc; if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; @@ -315,14 +318,30 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) return -EINVAL; - dmb->cpu_addr = dma_alloc_coherent(&ism->pdev->dev, dmb->dmb_len, - &dmb->dma_addr, - GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!dmb->cpu_addr) - clear_bit(dmb->sba_idx, ism->sba_bitmap); + folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | + __GFP_NORETRY, get_order(dmb->dmb_len)); - return dmb->cpu_addr ? 0 : -ENOMEM; + if (!folio) { + rc = -ENOMEM; + goto out_bit; + } + + dmb->cpu_addr = folio_address(folio); + dmb->dma_addr = dma_map_page(&ism->pdev->dev, + virt_to_page(dmb->cpu_addr), 0, + dmb->dmb_len, DMA_FROM_DEVICE); + if (dma_mapping_error(&ism->pdev->dev, dmb->dma_addr)) { + rc = -ENOMEM; + goto out_free; + } + + return 0; + +out_free: + kfree(dmb->cpu_addr); +out_bit: + clear_bit(dmb->sba_idx, ism->sba_bitmap); + return rc; } int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, From f609e7b1b49e4d15cf107d2069673ee63860c398 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:10:46 -0500 Subject: [PATCH 255/270] platform/x86/amd/pmc: Extend Framework 13 quirk to more BIOSes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BIOS 03.05 still hasn't fixed the spurious IRQ1 issue. As it's still being worked on there is still a possibility that it won't need to apply to future BIOS releases. Add a quirk for BIOS 03.05 as well. Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240410141046.433-1-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc-quirks.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index b456370166b6..b4f49720c87f 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -208,6 +208,15 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_BIOS_VERSION, "03.03"), } }, + { + .ident = "Framework Laptop 13 (Phoenix)", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Framework"), + DMI_MATCH(DMI_PRODUCT_NAME, "Laptop 13 (AMD Ryzen 7040Series)"), + DMI_MATCH(DMI_BIOS_VERSION, "03.05"), + } + }, {} }; From efefd4f00c967d00ad7abe092554ffbb70c1a793 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:01 +0200 Subject: [PATCH 256/270] netfilter: nf_tables: missing iterator type in lookup walk Add missing decorator type to lookup expression and tighten WARN_ON_ONCE check in pipapo to spot earlier that this is unset. Fixes: 29b359cf6d95 ("netfilter: nft_set_pipapo: walk over current view on netlink dump") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_lookup.c | 1 + net/netfilter/nft_set_pipapo.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index a0055f510e31..b314ca728a29 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -216,6 +216,7 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, return 0; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index eeaf05ffba95..0f903d18bbea 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2123,7 +2123,8 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_pipapo_field *f; unsigned int i, r; - WARN_ON_ONCE(iter->type == NFT_ITER_UNSPEC); + WARN_ON_ONCE(iter->type != NFT_ITER_READ && + iter->type != NFT_ITER_UPDATE); rcu_read_lock(); if (iter->type == NFT_ITER_READ) From e79b47a8615d42c68aaeb68971593333667382ed Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:11 +0200 Subject: [PATCH 257/270] netfilter: nf_tables: restore set elements when delete set fails From abort path, nft_mapelem_activate() needs to restore refcounters to the original state. Currently, it uses the set->ops->walk() to iterate over these set elements. The existing set iterator skips inactive elements in the next generation, this does not work from the abort path to restore the original state since it has to skip active elements instead (not inactive ones). This patch moves the check for inactive elements to the set iterator callback, then it reverses the logic for the .activate case which needs to skip active elements. Toggle next generation bit for elements when delete set command is invoked and call nft_clear() from .activate (abort) path to restore the next generation bit. The splat below shows an object in mappings memleak: [43929.457523] ------------[ cut here ]------------ [43929.457532] WARNING: CPU: 0 PID: 1139 at include/net/netfilter/nf_tables.h:1237 nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [...] [43929.458014] RIP: 0010:nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458076] Code: 83 f8 01 77 ab 49 8d 7c 24 08 e8 37 5e d0 de 49 8b 6c 24 08 48 8d 7d 50 e8 e9 5c d0 de 8b 45 50 8d 50 ff 89 55 50 85 c0 75 86 <0f> 0b eb 82 0f 0b eb b3 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 [43929.458081] RSP: 0018:ffff888140f9f4b0 EFLAGS: 00010246 [43929.458086] RAX: 0000000000000000 RBX: ffff8881434f5288 RCX: dffffc0000000000 [43929.458090] RDX: 00000000ffffffff RSI: ffffffffa26d28a7 RDI: ffff88810ecc9550 [43929.458093] RBP: ffff88810ecc9500 R08: 0000000000000001 R09: ffffed10281f3e8f [43929.458096] R10: 0000000000000003 R11: ffff0000ffff0000 R12: ffff8881434f52a0 [43929.458100] R13: ffff888140f9f5f4 R14: ffff888151c7a800 R15: 0000000000000002 [43929.458103] FS: 00007f0c687c4740(0000) GS:ffff888390800000(0000) knlGS:0000000000000000 [43929.458107] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43929.458111] CR2: 00007f58dbe5b008 CR3: 0000000123602005 CR4: 00000000001706f0 [43929.458114] Call Trace: [43929.458118] [43929.458121] ? __warn+0x9f/0x1a0 [43929.458127] ? nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458188] ? report_bug+0x1b1/0x1e0 [43929.458196] ? handle_bug+0x3c/0x70 [43929.458200] ? exc_invalid_op+0x17/0x40 [43929.458211] ? nft_setelem_data_deactivate+0xd7/0xf0 [nf_tables] [43929.458271] ? nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458332] nft_mapelem_deactivate+0x24/0x30 [nf_tables] [43929.458392] nft_rhash_walk+0xdd/0x180 [nf_tables] [43929.458453] ? __pfx_nft_rhash_walk+0x10/0x10 [nf_tables] [43929.458512] ? rb_insert_color+0x2e/0x280 [43929.458520] nft_map_deactivate+0xdc/0x1e0 [nf_tables] [43929.458582] ? __pfx_nft_map_deactivate+0x10/0x10 [nf_tables] [43929.458642] ? __pfx_nft_mapelem_deactivate+0x10/0x10 [nf_tables] [43929.458701] ? __rcu_read_unlock+0x46/0x70 [43929.458709] nft_delset+0xff/0x110 [nf_tables] [43929.458769] nft_flush_table+0x16f/0x460 [nf_tables] [43929.458830] nf_tables_deltable+0x501/0x580 [nf_tables] Fixes: 628bd3e49cba ("netfilter: nf_tables: drop map element references from preparation phase") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 44 ++++++++++++++++++++++++++++++---- net/netfilter/nft_set_bitmap.c | 4 +--- net/netfilter/nft_set_hash.c | 8 ++----- net/netfilter/nft_set_pipapo.c | 5 +--- net/netfilter/nft_set_rbtree.c | 4 +--- 5 files changed, 45 insertions(+), 20 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a7a34db62ea9..d0c09f899e80 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -594,6 +594,12 @@ static int nft_mapelem_deactivate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, elem_priv); return 0; @@ -617,6 +623,7 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, catchall->elem); break; } @@ -3880,6 +3887,9 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_data *data; int err; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; @@ -3903,17 +3913,20 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { - u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_iter dummy_iter = { + .genmask = nft_genmask_next(ctx->net), + }; struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask)) + if (!nft_set_elem_active(ext, dummy_iter.genmask)) continue; - ret = nft_setelem_validate(ctx, set, NULL, catchall->elem); + ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem); if (ret < 0) return ret; } @@ -5402,6 +5415,11 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + return nft_setelem_data_validate(ctx, set, elem_priv); } @@ -5494,6 +5512,13 @@ static int nft_mapelem_activate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + /* called from abort path, reverse check to undo changes. */ + if (nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, elem_priv); return 0; @@ -5511,6 +5536,7 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, catchall->elem); break; } @@ -5785,6 +5811,9 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; @@ -6635,7 +6664,7 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_setelem_is_catchall(set, elem_priv)) { - nft_set_elem_change_active(net, set, ext); + nft_clear(net, ext); } else { set->ops->activate(net, set, elem_priv); } @@ -7317,8 +7346,12 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_trans *trans; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, sizeof(struct nft_trans_elem), GFP_ATOMIC); if (!trans) @@ -10800,6 +10833,9 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 32df7a16835d..1caa04619dc6 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -172,7 +172,7 @@ static void nft_bitmap_activate(const struct net *net, nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); - nft_set_elem_change_active(net, set, &be->ext); + nft_clear(net, &be->ext); } static void nft_bitmap_flush(const struct net *net, @@ -222,8 +222,6 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, list_for_each_entry_rcu(be, &priv->list, head) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&be->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &be->priv); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6968a3b34236..daa56dda737a 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -199,7 +199,7 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, { struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_rhash_flush(const struct net *net, @@ -286,8 +286,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) @@ -599,7 +597,7 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set, { struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_hash_flush(const struct net *net, @@ -652,8 +650,6 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, hlist_for_each_entry_rcu(he, &priv->table[i], node) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 0f903d18bbea..187138afac45 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1847,7 +1847,7 @@ static void nft_pipapo_activate(const struct net *net, { struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &e->ext); + nft_clear(net, &e->ext); } /** @@ -2149,9 +2149,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, e = f->mt[r].e; - if (!nft_set_elem_active(&e->ext, iter->genmask)) - goto cont; - iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) goto out; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9944fe479e53..b7ea21327549 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -532,7 +532,7 @@ static void nft_rbtree_activate(const struct net *net, { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &rbe->ext); + nft_clear(net, &rbe->ext); } static void nft_rbtree_flush(const struct net *net, @@ -600,8 +600,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&rbe->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); if (iter->err < 0) { From 69ffed4b62523bbc85511f150500329d28aba356 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 17 Apr 2024 17:19:13 +0300 Subject: [PATCH 258/270] gpiolib: swnode: Remove wrong header inclusion The flags in the software node properties are supposed to be the GPIO lookup flags, which are provided by gpio/machine.h, as the software nodes are the kernel internal thing and doesn't need to rely to any of ABIs. Fixes: e7f9ff5dc90c ("gpiolib: add support for software nodes") Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/property.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/gpio/property.h b/include/linux/gpio/property.h index 6c75c8bd44a0..1a14e239221f 100644 --- a/include/linux/gpio/property.h +++ b/include/linux/gpio/property.h @@ -2,7 +2,6 @@ #ifndef __LINUX_GPIO_PROPERTY_H #define __LINUX_GPIO_PROPERTY_H -#include /* for GPIO_* flags */ #include #define PROPERTY_ENTRY_GPIO(_name_, _chip_node_, _idx_, _flags_) \ From 86a1471d7cde792941109b93b558b5dc078b9ee9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:21 +0200 Subject: [PATCH 259/270] netfilter: nf_tables: fix memleak in map from abort path The delete set command does not rely on the transaction object for element removal, therefore, a combination of delete element + delete set from the abort path could result in restoring twice the refcount of the mapping. Check for inactive element in the next generation for the delete element command in the abort path, skip restoring state if next generation bit has been already cleared. This is similar to the activate logic using the set walk iterator. [ 6170.286929] ------------[ cut here ]------------ [ 6170.286939] WARNING: CPU: 6 PID: 790302 at net/netfilter/nf_tables_api.c:2086 nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.287071] Modules linked in: [...] [ 6170.287633] CPU: 6 PID: 790302 Comm: kworker/6:2 Not tainted 6.9.0-rc3+ #365 [ 6170.287768] RIP: 0010:nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.287886] Code: df 48 8d 7d 58 e8 69 2e 3b df 48 8b 7d 58 e8 80 1b 37 df 48 8d 7d 68 e8 57 2e 3b df 48 8b 7d 68 e8 6e 1b 37 df 48 89 ef eb c4 <0f> 0b 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 0f [ 6170.287895] RSP: 0018:ffff888134b8fd08 EFLAGS: 00010202 [ 6170.287904] RAX: 0000000000000001 RBX: ffff888125bffb28 RCX: dffffc0000000000 [ 6170.287912] RDX: 0000000000000003 RSI: ffffffffa20298ab RDI: ffff88811ebe4750 [ 6170.287919] RBP: ffff88811ebe4700 R08: ffff88838e812650 R09: fffffbfff0623a55 [ 6170.287926] R10: ffffffff8311d2af R11: 0000000000000001 R12: ffff888125bffb10 [ 6170.287933] R13: ffff888125bffb10 R14: dead000000000122 R15: dead000000000100 [ 6170.287940] FS: 0000000000000000(0000) GS:ffff888390b00000(0000) knlGS:0000000000000000 [ 6170.287948] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6170.287955] CR2: 00007fd31fc00710 CR3: 0000000133f60004 CR4: 00000000001706f0 [ 6170.287962] Call Trace: [ 6170.287967] [ 6170.287973] ? __warn+0x9f/0x1a0 [ 6170.287986] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288092] ? report_bug+0x1b1/0x1e0 [ 6170.287986] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288092] ? report_bug+0x1b1/0x1e0 [ 6170.288104] ? handle_bug+0x3c/0x70 [ 6170.288112] ? exc_invalid_op+0x17/0x40 [ 6170.288120] ? asm_exc_invalid_op+0x1a/0x20 [ 6170.288132] ? nf_tables_chain_destroy+0x2b/0x220 [nf_tables] [ 6170.288243] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288366] ? nf_tables_chain_destroy+0x2b/0x220 [nf_tables] [ 6170.288483] nf_tables_trans_destroy_work+0x588/0x590 [nf_tables] Fixes: 591054469b3e ("netfilter: nf_tables: revisit chain/object refcounting from elements") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d0c09f899e80..167074283ea9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7223,6 +7223,16 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) } } +static int nft_setelem_active_next(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + u8 genmask = nft_genmask_next(net); + + return nft_set_elem_active(ext, genmask); +} + static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) @@ -10644,8 +10654,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; - nft_setelem_data_activate(net, te->set, te->elem_priv); - nft_setelem_activate(net, te->set, te->elem_priv); + if (!nft_setelem_active_next(net, te->set, te->elem_priv)) { + nft_setelem_data_activate(net, te->set, te->elem_priv); + nft_setelem_activate(net, te->set, te->elem_priv); + } if (!nft_setelem_is_catchall(te->set, te->elem_priv)) te->set->ndeact--; From 0f022d32c3eca477fbf79a205243a6123ed0fe11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Apr 2024 18:07:28 -0300 Subject: [PATCH 260/270] net/sched: Fix mirred deadlock on device recursion When the mirred action is used on a classful egress qdisc and a packet is mirrored or redirected to self we hit a qdisc lock deadlock. See trace below. [..... other info removed for brevity....] [ 82.890906] [ 82.890906] ============================================ [ 82.890906] WARNING: possible recursive locking detected [ 82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G W [ 82.890906] -------------------------------------------- [ 82.890906] ping/418 is trying to acquire lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] but task is already holding lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] other info that might help us debug this: [ 82.890906] Possible unsafe locking scenario: [ 82.890906] [ 82.890906] CPU0 [ 82.890906] ---- [ 82.890906] lock(&sch->q.lock); [ 82.890906] lock(&sch->q.lock); [ 82.890906] [ 82.890906] *** DEADLOCK *** [ 82.890906] [..... other info removed for brevity....] Example setup (eth0->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 Another example(eth0->eth1->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth1 tc qdisc add dev eth1 root handle 1: htb default 30 tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 We fix this by adding an owner field (CPU id) to struct Qdisc set after root qdisc is entered. When the softirq enters it a second time, if the qdisc owner is the same CPU, the packet is dropped to break the loop. Reported-by: Mingshuai Ren Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/ Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()") Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops") Signed-off-by: Eric Dumazet Reviewed-by: Victor Nogueira Reviewed-by: Pedro Tammela Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240415210728.36949-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 1 + net/core/dev.c | 6 ++++++ net/sched/sch_generic.c | 1 + 3 files changed, 8 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cefe0c4bdae3..41ca14e81d55 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -117,6 +117,7 @@ struct Qdisc { struct qdisc_skb_head q; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; + int owner; unsigned long state; unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; diff --git a/net/core/dev.c b/net/core/dev.c index 984ff8b9d0e1..331848eca7d3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3775,6 +3775,10 @@ no_lock_out: return rc; } + if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { + kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); + return NET_XMIT_DROP; + } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3814,7 +3818,9 @@ no_lock_out: qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { + WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ff5336493777..4a2c763e2d11 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -974,6 +974,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; + sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); From caed8eba221533123192d39cc947f45cbb1e1db5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Apr 2024 08:10:48 -0700 Subject: [PATCH 261/270] selftests: kselftest_harness: fix Clang warning about zero-length format Apparently it's more legal to pass the format as NULL, than it is to use an empty string. Clang complains about empty formats: ./../kselftest_harness.h:1207:30: warning: format string is empty [-Wformat-zero-length] 1207 | diagnostic ? "%s" : "", diagnostic); | ^~ 1 warning generated. Reported-by: Sean Christopherson Link: https://lore.kernel.org/all/20240409224256.1581292-1-seanjc@google.com Fixes: 378193eff339 ("selftests: kselftest_harness: let PASS / FAIL provide diagnostic") Tested-by: Sean Christopherson Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240416151048.1682352-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/kselftest.h | 10 ++++++---- tools/testing/selftests/kselftest_harness.h | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 541bf192e30e..4eca3fd1292c 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -288,15 +288,17 @@ void ksft_test_result_code(int exit_code, const char *test_name, } /* Docs seem to call for double space if directive is absent */ - if (!directive[0] && msg[0]) + if (!directive[0] && msg) directive = " # "; - va_start(args, msg); printf("%s %u %s%s", tap_code, ksft_test_num(), test_name, directive); errno = saved_errno; - vprintf(msg, args); + if (msg) { + va_start(args, msg); + vprintf(msg, args); + va_end(args); + } printf("\n"); - va_end(args); } static inline int ksft_exit_pass(void) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 4fd735e48ee7..adb15cae79ab 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -1202,7 +1202,7 @@ void __run_test(struct __fixture_metadata *f, diagnostic = "unknown"; ksft_test_result_code(t->exit_code, test_name, - diagnostic ? "%s" : "", diagnostic); + diagnostic ? "%s" : NULL, diagnostic); } static int test_harness_run(int argc, char **argv) From d362046021ea122309da8c8e0b6850c792ca97b5 Mon Sep 17 00:00:00 2001 From: Vanillan Wang Date: Tue, 16 Apr 2024 20:07:13 +0800 Subject: [PATCH 262/270] net:usb:qmi_wwan: support Rolling modules Update the qmi_wwan driver support for the Rolling LTE modules. - VID:PID 33f8:0104, RW101-GL for laptop debug M.2 cards(with RMNET interface for /Linux/Chrome OS) 0x0104: RMNET, diag, at, pipe Here are the outputs of usb-devices: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=33f8 ProdID=0104 Rev=05.04 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=ba2eb033 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms Signed-off-by: Vanillan Wang Link: https://lore.kernel.org/r/20240416120713.24777-1-vanillanwang@163.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index e2e181378f41..edc34402e787 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1431,6 +1431,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2692, 0x9025, 4)}, /* Cellient MPL200 (rebranded Qualcomm 05c6:9025) */ {QMI_QUIRK_SET_DTR(0x1546, 0x1312, 4)}, /* u-blox LARA-R6 01B */ {QMI_QUIRK_SET_DTR(0x1546, 0x1342, 4)}, /* u-blox LARA-L6 */ + {QMI_QUIRK_SET_DTR(0x33f8, 0x0104, 4)}, /* Rolling RW101 RMNET */ /* 4. Gobi 1000 devices */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */ From 94667949ec3bbb2218c46ad0a0e7274c8832e494 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 16 Apr 2024 10:23:29 +0200 Subject: [PATCH 263/270] net: ethernet: mtk_eth_soc: fix WED + wifi reset The WLAN + WED reset sequence relies on being able to receive interrupts from the card, in order to synchronize individual steps with the firmware. When WED is stopped, leave interrupts running and rely on the driver turning off unwanted ones. WED DMA also needs to be disabled before resetting. Fixes: f78cd9c783e0 ("net: ethernet: mtk_wed: update mtk_wed_stop") Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20240416082330.82564-1-nbd@nbd.name Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_wed.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index c895e265ae0e..61334a71058c 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -1074,13 +1074,13 @@ mtk_wed_dma_disable(struct mtk_wed_device *dev) static void mtk_wed_stop(struct mtk_wed_device *dev) { + mtk_wed_dma_disable(dev); mtk_wed_set_ext_int(dev, false); wed_w32(dev, MTK_WED_WPDMA_INT_TRIGGER, 0); wed_w32(dev, MTK_WED_WDMA_INT_TRIGGER, 0); wdma_w32(dev, MTK_WDMA_INT_MASK, 0); wdma_w32(dev, MTK_WDMA_INT_GRP2, 0); - wed_w32(dev, MTK_WED_WPDMA_INT_MASK, 0); if (!mtk_wed_get_rx_capa(dev)) return; @@ -1093,7 +1093,6 @@ static void mtk_wed_deinit(struct mtk_wed_device *dev) { mtk_wed_stop(dev); - mtk_wed_dma_disable(dev); wed_clr(dev, MTK_WED_CTRL, MTK_WED_CTRL_WDMA_INT_AGENT_EN | @@ -2605,9 +2604,6 @@ mtk_wed_irq_get(struct mtk_wed_device *dev, u32 mask) static void mtk_wed_irq_set_mask(struct mtk_wed_device *dev, u32 mask) { - if (!dev->running) - return; - mtk_wed_set_ext_int(dev, !!mask); wed_w32(dev, MTK_WED_INT_MASK, mask); } From def52db470df28d6f43cacbd21137f03b9502073 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:51 +0100 Subject: [PATCH 264/270] net: ravb: Count packets instead of descriptors in R-Car RX path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The units of "work done" in the RX path should be packets instead of descriptors. Descriptors which are used by the hardware to record error conditions or are empty in the case of a DMA mapping error should not count towards our RX work budget. Also make the limit variable unsigned as it can never be negative. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Reviewed-by: Niklas Söderlund Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index ba01c8cc3c90..7d231d011bff 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -892,29 +892,24 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) struct ravb_private *priv = netdev_priv(ndev); const struct ravb_hw_info *info = priv->info; int entry = priv->cur_rx[q] % priv->num_rx_ring[q]; - int boguscnt = (priv->dirty_rx[q] + priv->num_rx_ring[q]) - - priv->cur_rx[q]; struct net_device_stats *stats = &priv->stats[q]; struct ravb_ex_rx_desc *desc; + unsigned int limit, i; struct sk_buff *skb; dma_addr_t dma_addr; struct timespec64 ts; + int rx_packets = 0; u8 desc_status; u16 pkt_len; - int limit; - boguscnt = min(boguscnt, *quota); - limit = boguscnt; + limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; desc = &priv->rx_ring[q].ex_desc[entry]; - while (desc->die_dt != DT_FEMPTY) { + for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; pkt_len = le16_to_cpu(desc->ds_cc) & RX_DS; - if (--boguscnt < 0) - break; - /* We use 0-byte descriptors to mark the DMA mapping errors */ if (!pkt_len) continue; @@ -960,7 +955,7 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum(skb); napi_gro_receive(&priv->napi[q], skb); - stats->rx_packets++; + rx_packets++; stats->rx_bytes += pkt_len; } @@ -995,9 +990,9 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) desc->die_dt = DT_FEMPTY; } - *quota -= limit - (++boguscnt); - - return boguscnt <= 0; + stats->rx_packets += rx_packets; + *quota -= rx_packets; + return *quota == 0; } /* Packet receive function for Ethernet AVB */ From a892493a343494bd6bab9d098593932077ff3c43 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:52 +0100 Subject: [PATCH 265/270] net: ravb: Allow RX loop to move past DMA mapping errors The RX loops in ravb_rx_gbeth() and ravb_rx_rcar() skip to the next loop iteration if a zero-length descriptor is seen (indicating a DMA mapping error). However, the current RX descriptor index `priv->cur_rx[q]` was incremented at the end of the loop and so would not be incremented when we skip to the next loop iteration. This would cause the loop to keep seeing the same zero-length descriptor instead of moving on to the next descriptor. As the loop counter `i` still increments, the loop would eventually terminate so there is no risk of being stuck here forever - but we should still fix this to avoid wasting cycles. To fix this, the RX descriptor index is incremented at the top of the loop, in the for statement itself. The assignments of `entry` and `desc` are brought into the loop to avoid the need for duplication. Fixes: d8b48911fd24 ("ravb: fix ring memory allocation") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 25 ++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 7d231d011bff..3b870926af14 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -775,12 +775,15 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) int limit; int i; - entry = priv->cur_rx[q] % priv->num_rx_ring[q]; limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; stats = &priv->stats[q]; - desc = &priv->rx_ring[q].desc[entry]; - for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { + for (i = 0; i < limit; i++, priv->cur_rx[q]++) { + entry = priv->cur_rx[q] % priv->num_rx_ring[q]; + desc = &priv->rx_ring[q].desc[entry]; + if (rx_packets == *quota || desc->die_dt == DT_FEMPTY) + break; + /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; @@ -848,9 +851,6 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) break; } } - - entry = (++priv->cur_rx[q]) % priv->num_rx_ring[q]; - desc = &priv->rx_ring[q].desc[entry]; } /* Refill the RX ring buffers. */ @@ -891,7 +891,6 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) { struct ravb_private *priv = netdev_priv(ndev); const struct ravb_hw_info *info = priv->info; - int entry = priv->cur_rx[q] % priv->num_rx_ring[q]; struct net_device_stats *stats = &priv->stats[q]; struct ravb_ex_rx_desc *desc; unsigned int limit, i; @@ -901,10 +900,15 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) int rx_packets = 0; u8 desc_status; u16 pkt_len; + int entry; limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; - desc = &priv->rx_ring[q].ex_desc[entry]; - for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { + for (i = 0; i < limit; i++, priv->cur_rx[q]++) { + entry = priv->cur_rx[q] % priv->num_rx_ring[q]; + desc = &priv->rx_ring[q].ex_desc[entry]; + if (rx_packets == *quota || desc->die_dt == DT_FEMPTY) + break; + /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; @@ -958,9 +962,6 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) rx_packets++; stats->rx_bytes += pkt_len; } - - entry = (++priv->cur_rx[q]) % priv->num_rx_ring[q]; - desc = &priv->rx_ring[q].ex_desc[entry]; } /* Refill the RX ring buffers. */ From c7c449502b51c5b5de79f97a42be750b28f6ecee Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:53 +0100 Subject: [PATCH 266/270] net: ravb: Fix GbEth jumbo packet RX checksum handling Sending a 7kB ping packet to the RZ/G2L in v6.9-rc2 causes the following backtrace: WARNING: CPU: 0 PID: 0 at include/linux/skbuff.h:3127 skb_trim+0x30/0x38 Hardware name: Renesas SMARC EVK based on r9a07g044l2 (DT) pc : skb_trim+0x30/0x38 lr : ravb_rx_csum_gbeth+0x40/0x90 Call trace: skb_trim+0x30/0x38 ravb_rx_gbeth+0x56c/0x5cc ravb_poll+0xa0/0x204 __napi_poll+0x38/0x17c This is caused by ravb_rx_gbeth() calling ravb_rx_csum_gbeth() with the wrong skb for a packet which spans multiple descriptors. To fix this, use the correct skb. Fixes: c2da9408579d ("ravb: Add Rx checksum offload support for GbEth") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 3b870926af14..6969cdeeb67a 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -843,7 +843,7 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) priv->rx_1st_skb->protocol = eth_type_trans(priv->rx_1st_skb, ndev); if (ndev->features & NETIF_F_RXCSUM) - ravb_rx_csum_gbeth(skb); + ravb_rx_csum_gbeth(priv->rx_1st_skb); napi_gro_receive(&priv->napi[q], priv->rx_1st_skb); rx_packets++; From 2e36c9fbc476f95a1b19e3fa0a2cdf408475ff56 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:54 +0100 Subject: [PATCH 267/270] net: ravb: Fix RX byte accounting for jumbo packets The RX byte accounting for jumbo packets was changed to fix a potential use-after-free bug. However, that fix used the wrong variable and so only accounted for the number of bytes in the final descriptor, not the number of bytes in the whole packet. To fix this, we can simply update our stats with the correct number of bytes before calling napi_gro_receive(). Also rename pkt_len to desc_len in ravb_rx_gbeth() to avoid any future confusion. The variable name pkt_len is correct in ravb_rx_rcar() as that function does not handle packets spanning multiple descriptors. Fixes: 5a5a3e564de6 ("ravb: Fix potential use-after-free in ravb_rx_gbeth()") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 6969cdeeb67a..fcb756d77681 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -769,7 +769,7 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) dma_addr_t dma_addr; int rx_packets = 0; u8 desc_status; - u16 pkt_len; + u16 desc_len; u8 die_dt; int entry; int limit; @@ -787,10 +787,10 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; - pkt_len = le16_to_cpu(desc->ds_cc) & RX_DS; + desc_len = le16_to_cpu(desc->ds_cc) & RX_DS; /* We use 0-byte descriptors to mark the DMA mapping errors */ - if (!pkt_len) + if (!desc_len) continue; if (desc_status & MSC_MC) @@ -811,25 +811,25 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) switch (die_dt) { case DT_FSINGLE: skb = ravb_get_skb_gbeth(ndev, entry, desc); - skb_put(skb, pkt_len); + skb_put(skb, desc_len); skb->protocol = eth_type_trans(skb, ndev); if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum_gbeth(skb); napi_gro_receive(&priv->napi[q], skb); rx_packets++; - stats->rx_bytes += pkt_len; + stats->rx_bytes += desc_len; break; case DT_FSTART: priv->rx_1st_skb = ravb_get_skb_gbeth(ndev, entry, desc); - skb_put(priv->rx_1st_skb, pkt_len); + skb_put(priv->rx_1st_skb, desc_len); break; case DT_FMID: skb = ravb_get_skb_gbeth(ndev, entry, desc); skb_copy_to_linear_data_offset(priv->rx_1st_skb, priv->rx_1st_skb->len, skb->data, - pkt_len); - skb_put(priv->rx_1st_skb, pkt_len); + desc_len); + skb_put(priv->rx_1st_skb, desc_len); dev_kfree_skb(skb); break; case DT_FEND: @@ -837,17 +837,17 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) skb_copy_to_linear_data_offset(priv->rx_1st_skb, priv->rx_1st_skb->len, skb->data, - pkt_len); - skb_put(priv->rx_1st_skb, pkt_len); + desc_len); + skb_put(priv->rx_1st_skb, desc_len); dev_kfree_skb(skb); priv->rx_1st_skb->protocol = eth_type_trans(priv->rx_1st_skb, ndev); if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum_gbeth(priv->rx_1st_skb); + stats->rx_bytes += priv->rx_1st_skb->len; napi_gro_receive(&priv->napi[q], priv->rx_1st_skb); rx_packets++; - stats->rx_bytes += pkt_len; break; } } From 3aadf100f93d80815685493d60cd8cab206403df Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 18 Apr 2024 13:45:17 +0200 Subject: [PATCH 268/270] Revert "vmgenid: emit uevent when VMGENID updates" This reverts commit ad6bcdad2b6724e113f191a12f859a9e8456b26d. I had nak'd it, and Greg said on the thread that it links that he wasn't going to take it either, especially since it's not his code or his tree, but then, seemingly accidentally, it got pushed up some months later, in what looks like a mistake, with no further discussion in the linked thread. So revert it, since it's clearly not intended. Fixes: ad6bcdad2b67 ("vmgenid: emit uevent when VMGENID updates") Cc: stable@vger.kernel.org Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230531095119.11202-2-bchalios@amazon.es Signed-off-by: Jason A. Donenfeld --- drivers/virt/vmgenid.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/virt/vmgenid.c b/drivers/virt/vmgenid.c index b67a28da4702..a1c467a0e9f7 100644 --- a/drivers/virt/vmgenid.c +++ b/drivers/virt/vmgenid.c @@ -68,7 +68,6 @@ out: static void vmgenid_notify(struct acpi_device *device, u32 event) { struct vmgenid_state *state = acpi_driver_data(device); - char *envp[] = { "NEW_VMGENID=1", NULL }; u8 old_id[VMGENID_SIZE]; memcpy(old_id, state->this_id, sizeof(old_id)); @@ -76,7 +75,6 @@ static void vmgenid_notify(struct acpi_device *device, u32 event) if (!memcmp(old_id, state->this_id, sizeof(old_id))) return; add_vmfork_randomness(state->this_id, sizeof(state->this_id)); - kobject_uevent_env(&device->dev.kobj, KOBJ_CHANGE, envp); } static const struct acpi_device_id vmgenid_ids[] = { From 56f78615bcb1c3ba58a5d9911bad3d9185cf141b Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Wed, 17 Apr 2024 10:55:13 +0200 Subject: [PATCH 269/270] net: usb: ax88179_178a: avoid writing the mac address before first reading After the commit d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets"), reset operation, in which the default mac address from the device is read, is not executed from bind operation and the random address, that is pregenerated just in case, is direclty written the first time in the device, so the default one from the device is not even read. This writing is not dangerous because is volatile and the default mac address is not missed. In order to avoid this and keep the simplification to have only one reset and reduce the delays, restore the reset from bind operation and remove the reset that is commanded from open operation. The behavior is the same but everything is ready for usbnet_probe. Tested with ASIX AX88179 USB Gigabit Ethernet devices. Restore the old behavior for the rest of possible devices because I don't have the hardware to test. cc: stable@vger.kernel.org # 6.6+ Fixes: d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets") Reported-by: Jarkko Palviainen Signed-off-by: Jose Ignacio Tornos Martinez Link: https://lore.kernel.org/r/20240417085524.219532-1-jtornosm@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/ax88179_178a.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index a9c418890a1c..752f821a1990 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1317,6 +1317,8 @@ static int ax88179_bind(struct usbnet *dev, struct usb_interface *intf) netif_set_tso_max_size(dev->net, 16384); + ax88179_reset(dev); + return 0; } @@ -1695,7 +1697,6 @@ static const struct driver_info ax88179_info = { .unbind = ax88179_unbind, .status = ax88179_status, .link_reset = ax88179_link_reset, - .reset = ax88179_reset, .stop = ax88179_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX, .rx_fixup = ax88179_rx_fixup, @@ -1708,7 +1709,6 @@ static const struct driver_info ax88178a_info = { .unbind = ax88179_unbind, .status = ax88179_status, .link_reset = ax88179_link_reset, - .reset = ax88179_reset, .stop = ax88179_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX, .rx_fixup = ax88179_rx_fixup, From c24cd679b075b0e953ea167b0aa2b2d59e4eba7f Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Wed, 17 Apr 2024 15:24:25 +0530 Subject: [PATCH 270/270] net: ethernet: ti: am65-cpsw-nuss: cleanup DMA Channels before using them The TX and RX DMA Channels used by the driver to exchange data with CPSW are not guaranteed to be in a clean state during driver initialization. The Bootloader could have used the same DMA Channels without cleaning them up in the event of failure. Thus, reset and disable the DMA Channels to ensure that they are in a clean state before using them. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Reported-by: Schuyler Patton Signed-off-by: Siddharth Vadapalli Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20240417095425.2253876-1-s-vadapalli@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 2939a21ca74f..1d00e21808c1 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2793,6 +2793,8 @@ static void am65_cpsw_unregister_devlink(struct am65_cpsw_common *common) static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common) { + struct am65_cpsw_rx_chn *rx_chan = &common->rx_chns; + struct am65_cpsw_tx_chn *tx_chan = common->tx_chns; struct device *dev = common->dev; struct am65_cpsw_port *port; int ret = 0, i; @@ -2805,6 +2807,22 @@ static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common) if (ret) return ret; + /* The DMA Channels are not guaranteed to be in a clean state. + * Reset and disable them to ensure that they are back to the + * clean state and ready to be used. + */ + for (i = 0; i < common->tx_ch_num; i++) { + k3_udma_glue_reset_tx_chn(tx_chan[i].tx_chn, &tx_chan[i], + am65_cpsw_nuss_tx_cleanup); + k3_udma_glue_disable_tx_chn(tx_chan[i].tx_chn); + } + + for (i = 0; i < AM65_CPSW_MAX_RX_FLOWS; i++) + k3_udma_glue_reset_rx_chn(rx_chan->rx_chn, i, rx_chan, + am65_cpsw_nuss_rx_cleanup, !!i); + + k3_udma_glue_disable_rx_chn(rx_chan->rx_chn); + ret = am65_cpsw_nuss_register_devlink(common); if (ret) return ret;