From f85daf0e725358be78dfd208dea5fd665d8cb901 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Wed, 1 Jun 2022 14:46:25 +0800 Subject: [PATCH 001/248] xfrm: xfrm_policy: fix a possible double xfrm_pols_put() in xfrm_bundle_lookup() xfrm_policy_lookup() will call xfrm_pol_hold_rcu() to get a refcount of pols[0]. This refcount can be dropped in xfrm_expand_policies() when xfrm_expand_policies() return error. pols[0]'s refcount is balanced in here. But xfrm_bundle_lookup() will also call xfrm_pols_put() with num_pols == 1 to drop this refcount when xfrm_expand_policies() return error. This patch also fix an illegal address access. pols[0] will save a error point when xfrm_policy_lookup fails. This lead to xfrm_pols_put to resolve an illegal address in xfrm_bundle_lookup's error path. Fix these by setting num_pols = 0 in xfrm_expand_policies()'s error path. Fixes: 80c802f3073e ("xfrm: cache bundles instead of policies for outgoing flows") Signed-off-by: Hangyu Hua Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f1876ea61fdc..f1a0bab920a5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2678,8 +2678,10 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, *num_xfrms = 0; return 0; } - if (IS_ERR(pols[0])) + if (IS_ERR(pols[0])) { + *num_pols = 0; return PTR_ERR(pols[0]); + } *num_xfrms = pols[0]->xfrm_nr; @@ -2694,6 +2696,7 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); + *num_pols = 0; return PTR_ERR(pols[1]); } (*num_pols)++; From 16bc4d196b2a8960cac3bdfd4c98d72c44ab043b Mon Sep 17 00:00:00 2001 From: Peter Geis Date: Mon, 6 Jun 2022 12:30:23 -0400 Subject: [PATCH 002/248] arm64: dts: rockchip: Fix ethernet on production Quartz64-B The production Quartz64 Model B has compatibility issues when using rgmii-id mode. Switch to rgmii mode and use the SoC's delays to ensure full compatibility. Reported-by: Frank Mankel Fixes: dcc8c66bef79 ("arm64: dts: rockchip: add Pine64 Quartz64-B device tree") Signed-off-by: Peter Geis Tested-by: Frank Mankel Link: https://lore.kernel.org/r/20220606163023.3677147-1-pgwipeout@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts index 7bdcecc0dfe4..02d5f5a8ca03 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts @@ -133,7 +133,7 @@ assigned-clocks = <&cru SCLK_GMAC1_RX_TX>, <&cru SCLK_GMAC1_RGMII_SPEED>, <&cru SCLK_GMAC1>; assigned-clock-parents = <&cru SCLK_GMAC1_RGMII_SPEED>, <&cru SCLK_GMAC1>, <&gmac1_clkin>; clock_in_out = "input"; - phy-mode = "rgmii-id"; + phy-mode = "rgmii"; phy-supply = <&vcc_3v3>; pinctrl-names = "default"; pinctrl-0 = <&gmac1m1_miim From 0f5de2f0532229752d923c769a5b202ae437523b Mon Sep 17 00:00:00 2001 From: Gao Chao Date: Wed, 8 Jun 2022 10:42:49 +0800 Subject: [PATCH 003/248] power: supply: ab8500_fg: add missing destroy_workqueue in ab8500_fg_probe In ab8500_fg_probe, misses destroy_workqueue in error path, this patch fixes that. Fixes: 010ddb813f35 ("power: supply: ab8500_fg: Allocate wq in probe") Signed-off-by: Gao Chao Reviewed-by: Linus Walleij Signed-off-by: Sebastian Reichel --- drivers/power/supply/ab8500_fg.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/ab8500_fg.c b/drivers/power/supply/ab8500_fg.c index ec8a404d71b4..4339fa9ff009 100644 --- a/drivers/power/supply/ab8500_fg.c +++ b/drivers/power/supply/ab8500_fg.c @@ -3148,6 +3148,7 @@ static int ab8500_fg_probe(struct platform_device *pdev) ret = ab8500_fg_init_hw_registers(di); if (ret) { dev_err(dev, "failed to initialize registers\n"); + destroy_workqueue(di->fg_wq); return ret; } @@ -3159,6 +3160,7 @@ static int ab8500_fg_probe(struct platform_device *pdev) di->fg_psy = devm_power_supply_register(dev, &ab8500_fg_desc, &psy_cfg); if (IS_ERR(di->fg_psy)) { dev_err(dev, "failed to register FG psy\n"); + destroy_workqueue(di->fg_wq); return PTR_ERR(di->fg_psy); } @@ -3174,8 +3176,10 @@ static int ab8500_fg_probe(struct platform_device *pdev) /* Register primary interrupt handlers */ for (i = 0; i < ARRAY_SIZE(ab8500_fg_irq); i++) { irq = platform_get_irq_byname(pdev, ab8500_fg_irq[i].name); - if (irq < 0) + if (irq < 0) { + destroy_workqueue(di->fg_wq); return irq; + } ret = devm_request_threaded_irq(dev, irq, NULL, ab8500_fg_irq[i].isr, @@ -3185,6 +3189,7 @@ static int ab8500_fg_probe(struct platform_device *pdev) if (ret != 0) { dev_err(dev, "failed to request %s IRQ %d: %d\n", ab8500_fg_irq[i].name, irq, ret); + destroy_workqueue(di->fg_wq); return ret; } dev_dbg(dev, "Requested %s IRQ %d: %d\n", @@ -3200,6 +3205,7 @@ static int ab8500_fg_probe(struct platform_device *pdev) ret = ab8500_fg_sysfs_init(di); if (ret) { dev_err(dev, "failed to create sysfs entry\n"); + destroy_workqueue(di->fg_wq); return ret; } @@ -3207,6 +3213,7 @@ static int ab8500_fg_probe(struct platform_device *pdev) if (ret) { dev_err(dev, "failed to create FG psy\n"); ab8500_fg_sysfs_exit(di); + destroy_workqueue(di->fg_wq); return ret; } From 80192eff64eee9b3bc0594a47381937b94b9d65a Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 23 May 2022 18:10:09 +0400 Subject: [PATCH 004/248] power/reset: arm-versatile: Fix refcount leak in versatile_reboot_probe of_find_matching_node_and_match() returns a node pointer with refcount incremented, we should use of_node_put() on it when not need anymore. Add missing of_node_put() to avoid refcount leak. Fixes: 0e545f57b708 ("power: reset: driver for the Versatile syscon reboot") Signed-off-by: Miaoqian Lin Reviewed-by: Linus Walleij Signed-off-by: Sebastian Reichel --- drivers/power/reset/arm-versatile-reboot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/power/reset/arm-versatile-reboot.c b/drivers/power/reset/arm-versatile-reboot.c index 08d0a07b58ef..c7624d7611a7 100644 --- a/drivers/power/reset/arm-versatile-reboot.c +++ b/drivers/power/reset/arm-versatile-reboot.c @@ -146,6 +146,7 @@ static int __init versatile_reboot_probe(void) versatile_reboot_type = (enum versatile_reboot)reboot_id->data; syscon_regmap = syscon_node_to_regmap(np); + of_node_put(np); if (IS_ERR(syscon_regmap)) return PTR_ERR(syscon_regmap); From 093d27bb6f2d1963f927ef59c9a2d37059175426 Mon Sep 17 00:00:00 2001 From: Dorian Rudolph Date: Sat, 14 May 2022 17:23:40 +0200 Subject: [PATCH 005/248] power: supply: core: Fix boundary conditions in interpolation The functions power_supply_temp2resist_simple and power_supply_ocv2cap_simple handle boundary conditions incorrectly. The change was introduced in a4585ba2050f460f749bbaf2b67bd56c41e30283 ("power: supply: core: Use library interpolation"). There are two issues: First, the lines "high = i - 1" and "high = i" in ocv2cap have the wrong order compared to temp2resist. As a consequence, ocv2cap sets high=-1 if ocv>table[0].ocv, which causes an out-of-bounds read. Second, the logic of temp2resist is also not correct. Consider the case table[] = {{20, 100}, {10, 80}, {0, 60}}. For temp=5, we expect a resistance of 70% by interpolation. However, temp2resist sets high=low=2 and returns 60. Cc: stable@vger.kernel.org Signed-off-by: Dorian Rudolph Reviewed-by: Linus Walleij Fixes: a4585ba2050f ("power: supply: core: Use library interpolation") Signed-off-by: Sebastian Reichel --- drivers/power/supply/power_supply_core.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c index fad5890c899e..470253c337c7 100644 --- a/drivers/power/supply/power_supply_core.c +++ b/drivers/power/supply/power_supply_core.c @@ -846,17 +846,17 @@ int power_supply_temp2resist_simple(struct power_supply_resistance_temp_table *t { int i, high, low; - /* Break loop at table_len - 1 because that is the highest index */ - for (i = 0; i < table_len - 1; i++) + for (i = 0; i < table_len; i++) if (temp > table[i].temp) break; /* The library function will deal with high == low */ - if ((i == 0) || (i == (table_len - 1))) - high = i; + if (i == 0) + high = low = i; + else if (i == table_len) + high = low = i - 1; else - high = i - 1; - low = i; + high = (low = i) - 1; return fixp_linear_interpolate(table[low].temp, table[low].resistance, @@ -958,17 +958,17 @@ int power_supply_ocv2cap_simple(struct power_supply_battery_ocv_table *table, { int i, high, low; - /* Break loop at table_len - 1 because that is the highest index */ - for (i = 0; i < table_len - 1; i++) + for (i = 0; i < table_len; i++) if (ocv > table[i].ocv) break; /* The library function will deal with high == low */ - if ((i == 0) || (i == (table_len - 1))) - high = i - 1; + if (i == 0) + high = low = i; + else if (i == table_len) + high = low = i - 1; else - high = i; /* i.e. i == 0 */ - low = i; + high = (low = i) - 1; return fixp_linear_interpolate(table[low].ocv, table[low].capacity, From 2881a4ab319918e775ec9c084da3d6cc15ad77ab Mon Sep 17 00:00:00 2001 From: Peter Geis Date: Fri, 10 Jun 2022 09:25:42 -0400 Subject: [PATCH 006/248] arm64: dts: rockchip: Fix Quartz64-A dwc3 otg port behavior The otg_id line on the Quartz64 Model A is not connected to anything. This prevents automatic selection of the dual role usb port. In otg mode it defaults to device mode. Force it to host mode to retain previous behavior. Fixes: bc405bb3eeee ("arm64: dts: rockchip: enable otg/drd operation of usb_host0_xhci in rk356x") Signed-off-by: Peter Geis Link: https://lore.kernel.org/r/20220610132542.159978-1-pgwipeout@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts index 1534e11a9ad1..fa953b736642 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts @@ -687,6 +687,7 @@ }; &usb_host0_xhci { + dr_mode = "host"; status = "okay"; }; From 2d56af33d4df94d2b76446ffc3e3654c42232f4b Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 7 Jun 2022 14:15:36 -0700 Subject: [PATCH 007/248] arm64: dts: rockchip: Assign RK3399 VDU clock rate Before commit 9998943f6dfc ("media: rkvdec: Stop overclocking the decoder"), the rkvdec driver was forcing the VDU clock rate. After that commit, we rely on the default clock rate. That rate works OK on many boards, with the default PLL settings (CPLL is 800MHz, VDU dividers leave it at 400MHz); but some boards change PLL settings. Assign the expected default clock rate explicitly, so that the rate is consistent, regardless of PLL configuration. This was particularly broken on RK3399 Gru Scarlet systems, where the rk3399-gru-scarlet.dtsi assigns PLL_CPLL to 1.6 GHz, and so the VDU clock ends up at 800 MHz (twice the expected rate), and causes video artifacts and other issues. Note: I assign the clock rate in the clock controller instead of the vdec node, because there are multiple nodes that use this clock, and per the clock.yaml specification: Configuring a clock's parent and rate through the device node that consumes the clock can be done only for clocks that have a single user. Specifying conflicting parent or rate configuration in multiple consumer nodes for a shared clock is forbidden. Configuration of common clocks, which affect multiple consumer devices can be similarly specified in the clock provider node. Fixes: 9998943f6dfc ("media: rkvdec: Stop overclocking the decoder") Cc: Signed-off-by: Brian Norris Reviewed-by: Nicolas Dufresne Link: https://lore.kernel.org/r/20220607141535.1.Idafe043ffc94756a69426ec68872db0645c5d6e2@changeid Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi | 4 +++- arch/arm64/boot/dts/rockchip/rk3399.dtsi | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi index 913d845eb51a..1977103a5ef4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi @@ -376,7 +376,8 @@ camera: &i2c7 { <&cru ACLK_VIO>, <&cru ACLK_GIC_PRE>, <&cru PCLK_DDR>, - <&cru ACLK_HDCP>; + <&cru ACLK_HDCP>, + <&cru ACLK_VDU>; assigned-clock-rates = <600000000>, <1600000000>, <1000000000>, @@ -388,6 +389,7 @@ camera: &i2c7 { <400000000>, <200000000>, <200000000>, + <400000000>, <400000000>; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index fbd0346624e6..9d5b0e8c9cca 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -1462,7 +1462,8 @@ <&cru HCLK_PERILP1>, <&cru PCLK_PERILP1>, <&cru ACLK_VIO>, <&cru ACLK_HDCP>, <&cru ACLK_GIC_PRE>, - <&cru PCLK_DDR>; + <&cru PCLK_DDR>, + <&cru ACLK_VDU>; assigned-clock-rates = <594000000>, <800000000>, <1000000000>, @@ -1473,7 +1474,8 @@ <100000000>, <50000000>, <400000000>, <400000000>, <200000000>, - <200000000>; + <200000000>, + <400000000>; }; grf: syscon@ff770000 { From 85ff37e302efdf173cff6d1a310c2f7f38f1d069 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2022 19:47:26 +0300 Subject: [PATCH 008/248] gpiolib: cdev: Fix kernel doc for struct line Kernel doc validator is not happy: gpiolib-cdev.c:487: warning: Function parameter or member 'hdesc' not described in 'line' gpiolib-cdev.c:487: warning: Function parameter or member 'raw_level' not described in 'line' gpiolib-cdev.c:487: warning: Function parameter or member 'total_discard_seq' not described in 'line' gpiolib-cdev.c:487: warning: Function parameter or member 'last_seqno' not described in 'line' Describe above mentioned parameters. Fixes: 2068339a6c35 ("gpiolib: cdev: Add hardware timestamp clock type") Signed-off-by: Andy Shevchenko Acked-by: Dipen Patel Acked-by: Bartosz Golaszewski Signed-off-by: Thierry Reding --- drivers/gpio/gpiolib-cdev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index d8dba8c9d2cf..7a952a0307cd 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -421,6 +421,10 @@ out_free_lh: * @work: the worker that implements software debouncing * @sw_debounced: flag indicating if the software debouncer is active * @level: the current debounced physical level of the line + * @hdesc: the Hardware Timestamp Engine (HTE) descriptor + * @raw_level: the line level at the time of event + * @total_discard_seq: the running counter of the discarded events + * @last_seqno: the last sequence number before debounce period expires */ struct line { struct gpio_desc *desc; From f4470dbfb5ff92804650bc71d115c3f150d430f6 Mon Sep 17 00:00:00 2001 From: Liang He Date: Thu, 16 Jun 2022 10:17:13 +0800 Subject: [PATCH 009/248] ARM: rockchip: Add missing of_node_put() in rockchip_suspend_init() In rockchip_suspend_init(), of_find_matching_node_and_match() will return a node pointer with refcount incremented. We should use of_node_put() in fail path or when it is not used anymore. Signed-off-by: Liang He Link: https://lore.kernel.org/r/20220616021713.3973472-1-windhl@126.com Signed-off-by: Heiko Stuebner --- arch/arm/mach-rockchip/pm.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-rockchip/pm.c b/arch/arm/mach-rockchip/pm.c index 87389d9456b9..30d781d80fe0 100644 --- a/arch/arm/mach-rockchip/pm.c +++ b/arch/arm/mach-rockchip/pm.c @@ -311,7 +311,7 @@ void __init rockchip_suspend_init(void) &match); if (!match) { pr_err("Failed to find PMU node\n"); - return; + goto out_put; } pm_data = (struct rockchip_pm_data *) match->data; @@ -320,9 +320,12 @@ void __init rockchip_suspend_init(void) if (ret) { pr_err("%s: matches init error %d\n", __func__, ret); - return; + goto out_put; } } suspend_set_ops(pm_data->ops); + +out_put: + of_node_put(np); } From cd16044d7c38d76fe2f9b71a06619e9590e3e401 Mon Sep 17 00:00:00 2001 From: Marcin Wojtas Date: Mon, 20 Jun 2022 14:10:46 +0200 Subject: [PATCH 010/248] serial: 8250: dw: enable using pdata with ACPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit ffd381445eac ("serial: 8250: dw: Move the USR register to pdata") caused NULL-pointer dereference when booting with ACPI by unconditional usage of the recently added pdata. In order to fix that and prevent similar issues in future, hook the default version of this structure in dw8250_acpi_match table. While at it, sort all entries alphabetically. Fixes: ffd381445eac ("serial: 8250: dw: Move the USR register to pdata") Reviewed-by: Ilpo Järvinen Reviewed-by: Andy Shevchenko Signed-off-by: Marcin Wojtas Link: https://lore.kernel.org/r/20220620121046.1307412-1-mw@semihalf.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_dw.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c index f57bbd32ef11..b1ab088fbe6e 100644 --- a/drivers/tty/serial/8250/8250_dw.c +++ b/drivers/tty/serial/8250/8250_dw.c @@ -773,18 +773,18 @@ static const struct of_device_id dw8250_of_match[] = { MODULE_DEVICE_TABLE(of, dw8250_of_match); static const struct acpi_device_id dw8250_acpi_match[] = { - { "INT33C4", 0 }, - { "INT33C5", 0 }, - { "INT3434", 0 }, - { "INT3435", 0 }, - { "80860F0A", 0 }, - { "8086228A", 0 }, - { "APMC0D08", 0}, - { "AMD0020", 0 }, - { "AMDI0020", 0 }, - { "AMDI0022", 0 }, - { "BRCM2032", 0 }, - { "HISI0031", 0 }, + { "80860F0A", (kernel_ulong_t)&dw8250_dw_apb }, + { "8086228A", (kernel_ulong_t)&dw8250_dw_apb }, + { "AMD0020", (kernel_ulong_t)&dw8250_dw_apb }, + { "AMDI0020", (kernel_ulong_t)&dw8250_dw_apb }, + { "AMDI0022", (kernel_ulong_t)&dw8250_dw_apb }, + { "APMC0D08", (kernel_ulong_t)&dw8250_dw_apb}, + { "BRCM2032", (kernel_ulong_t)&dw8250_dw_apb }, + { "HISI0031", (kernel_ulong_t)&dw8250_dw_apb }, + { "INT33C4", (kernel_ulong_t)&dw8250_dw_apb }, + { "INT33C5", (kernel_ulong_t)&dw8250_dw_apb }, + { "INT3434", (kernel_ulong_t)&dw8250_dw_apb }, + { "INT3435", (kernel_ulong_t)&dw8250_dw_apb }, { }, }; MODULE_DEVICE_TABLE(acpi, dw8250_acpi_match); From f7e35e4bf1e8dc2c8cbd5e0955dc1bd58558dae0 Mon Sep 17 00:00:00 2001 From: Chanho Park Date: Mon, 27 Jun 2022 15:51:13 +0900 Subject: [PATCH 011/248] tty: serial: samsung_tty: set dma burst_size to 1 The src_maxburst and dst_maxburst have been changed to 1 but the settings of the UCON register aren't changed yet. They should be changed as well according to the dmaengine slave config. Fixes: aa2f80e752c7 ("serial: samsung: fix maxburst parameter for DMA transactions") Cc: stable Cc: Marek Szyprowski Reviewed-by: Krzysztof Kozlowski Signed-off-by: Chanho Park Link: https://lore.kernel.org/r/20220627065113.139520-1-chanho61.park@samsung.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/samsung_tty.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/tty/serial/samsung_tty.c b/drivers/tty/serial/samsung_tty.c index d5ca904def34..1afe47b62ad5 100644 --- a/drivers/tty/serial/samsung_tty.c +++ b/drivers/tty/serial/samsung_tty.c @@ -377,8 +377,7 @@ static void enable_tx_dma(struct s3c24xx_uart_port *ourport) /* Enable tx dma mode */ ucon = rd_regl(port, S3C2410_UCON); ucon &= ~(S3C64XX_UCON_TXBURST_MASK | S3C64XX_UCON_TXMODE_MASK); - ucon |= (dma_get_cache_alignment() >= 16) ? - S3C64XX_UCON_TXBURST_16 : S3C64XX_UCON_TXBURST_1; + ucon |= S3C64XX_UCON_TXBURST_1; ucon |= S3C64XX_UCON_TXMODE_DMA; wr_regl(port, S3C2410_UCON, ucon); @@ -674,7 +673,7 @@ static void enable_rx_dma(struct s3c24xx_uart_port *ourport) S3C64XX_UCON_DMASUS_EN | S3C64XX_UCON_TIMEOUT_EN | S3C64XX_UCON_RXMODE_MASK); - ucon |= S3C64XX_UCON_RXBURST_16 | + ucon |= S3C64XX_UCON_RXBURST_1 | 0xf << S3C64XX_UCON_TIMEOUT_SHIFT | S3C64XX_UCON_EMPTYINT_EN | S3C64XX_UCON_TIMEOUT_EN | From 211565b100993c90b53bf40851eacaefc830cfe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 14 Jun 2022 10:56:37 +0300 Subject: [PATCH 012/248] serial: pl011: UPSTAT_AUTORTS requires .throttle/unthrottle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver must provide throttle and unthrottle in uart_ops when it sets UPSTAT_AUTORTS. Add them using existing stop_rx & enable_interrupts functions. Fixes: 2a76fa283098 (serial: pl011: Adopt generic flag to store auto RTS status) Cc: stable Cc: Lukas Wunner Reported-by: Nuno Gonçalves Tested-by: Nuno Gonçalves Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220614075637.8558-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/amba-pl011.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 97ef41cb2721..16a21422ddce 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -1367,6 +1367,15 @@ static void pl011_stop_rx(struct uart_port *port) pl011_dma_rx_stop(uap); } +static void pl011_throttle_rx(struct uart_port *port) +{ + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + pl011_stop_rx(port); + spin_unlock_irqrestore(&port->lock, flags); +} + static void pl011_enable_ms(struct uart_port *port) { struct uart_amba_port *uap = @@ -1788,9 +1797,10 @@ static int pl011_allocate_irq(struct uart_amba_port *uap) */ static void pl011_enable_interrupts(struct uart_amba_port *uap) { + unsigned long flags; unsigned int i; - spin_lock_irq(&uap->port.lock); + spin_lock_irqsave(&uap->port.lock, flags); /* Clear out any spuriously appearing RX interrupts */ pl011_write(UART011_RTIS | UART011_RXIS, uap, REG_ICR); @@ -1812,7 +1822,14 @@ static void pl011_enable_interrupts(struct uart_amba_port *uap) if (!pl011_dma_rx_running(uap)) uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); - spin_unlock_irq(&uap->port.lock); + spin_unlock_irqrestore(&uap->port.lock, flags); +} + +static void pl011_unthrottle_rx(struct uart_port *port) +{ + struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); + + pl011_enable_interrupts(uap); } static int pl011_startup(struct uart_port *port) @@ -2225,6 +2242,8 @@ static const struct uart_ops amba_pl011_pops = { .stop_tx = pl011_stop_tx, .start_tx = pl011_start_tx, .stop_rx = pl011_stop_rx, + .throttle = pl011_throttle_rx, + .unthrottle = pl011_unthrottle_rx, .enable_ms = pl011_enable_ms, .break_ctl = pl011_break_ctl, .startup = pl011_startup, From f8d6e9d3ca5c68e24dd485132a93d49abd444eaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 15 Jun 2022 12:06:49 +0300 Subject: [PATCH 013/248] serial: 8250: Fix __stop_tx() & DMA Tx restart races MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e8ffbb71f783 ("serial: 8250: use THRE & __stop_tx also with DMA") changed __dma_tx_complete() to enable THRI that is cleared in __stop_tx() once THRE is asserted as UART runs out bits to transmit. It is possible, however, that more data arrives in between in which case serial8250_tx_dma() resumes Tx. THRI is not supposed to be on during DMA Tx because DMA is based on completion handler, therefore THRI must be cleared unconditionally in serial8250_tx_dma(). When Tx is about to start, another race window exists with serial8250_handle_irq() leading to a call into __stop_tx() while the Tx has already been resumed: __tx_complete(): -> spin_lock(port->lock) -> dma->tx_running = 0 -> serial8250_set_THRI() -> spin_unlock(port->lock) uart_start(): serial8250_handle_irq(): -> spin_lock(port->lock) -> serial8250_tx_dma(): -> dma->tx_running = 1 -> spin_unlock(port->lock) -> spin_lock(port->lock) -> __stop_tx() Close this race by checking !dma->tx_running before calling into __stop_tx(). Fixes: e8ffbb71f783 ("serial: 8250: use THRE & __stop_tx also with DMA") Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220615090651.15340-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_dma.c | 6 +++--- drivers/tty/serial/8250/8250_port.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/tty/serial/8250/8250_dma.c b/drivers/tty/serial/8250/8250_dma.c index 7133fceed35e..a8dba4a0a8fb 100644 --- a/drivers/tty/serial/8250/8250_dma.c +++ b/drivers/tty/serial/8250/8250_dma.c @@ -106,10 +106,10 @@ int serial8250_tx_dma(struct uart_8250_port *p) UART_XMIT_SIZE, DMA_TO_DEVICE); dma_async_issue_pending(dma->txchan); - if (dma->tx_err) { + serial8250_clear_THRI(p); + if (dma->tx_err) dma->tx_err = 0; - serial8250_clear_THRI(p); - } + return 0; err: dma->tx_err = 1; diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 8f32fe9e149e..b9cdc5552b0d 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1949,7 +1949,7 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir) if ((status & UART_LSR_THRE) && (up->ier & UART_IER_THRI)) { if (!up->dma || up->dma->tx_err) serial8250_tx_chars(up); - else + else if (!up->dma->tx_running) __stop_tx(up); } From ec5ad331680c96ef3dd30dc297b206988023b9e1 Mon Sep 17 00:00:00 2001 From: Max Staudt Date: Sat, 18 Jun 2022 20:01:34 +0200 Subject: [PATCH 014/248] tty: Add N_CAN327 line discipline ID for ELM327 based CAN driver The actual driver will be added via the CAN tree. Acked-by: Marc Kleine-Budde Signed-off-by: Max Staudt Link: https://lore.kernel.org/r/20220618180134.9890-1-max@enpas.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/tty.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h index 9d0f06bfbac3..68aeae2addec 100644 --- a/include/uapi/linux/tty.h +++ b/include/uapi/linux/tty.h @@ -38,8 +38,9 @@ #define N_NULL 27 /* Null ldisc used for error handling */ #define N_MCTP 28 /* MCTP-over-serial */ #define N_DEVELOPMENT 29 /* Manual out-of-tree testing */ +#define N_CAN327 30 /* ELM327 based OBD-II interfaces */ /* Always the newest line discipline + 1 */ -#define NR_LDISCS 30 +#define NR_LDISCS 31 #endif /* _UAPI_LINUX_TTY_H */ From 039d4ed3428cf9c2052048d177880ebd02104764 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 27 Jun 2022 15:11:41 -0700 Subject: [PATCH 015/248] Input: usbtouchscreen - add driver_info sanity check Add a sanity check on the device id-table driver_info field to make sure we never access a type structure (and function pointers) outside of the device info array (e.g. if someone fails to ifdef a device-id entry). Note that this also suppresses a compiler warning with -Warray-bounds (gcc-11.3.0) when compile-testing the driver without enabling any of the device type Kconfig options: drivers/input/touchscreen/usbtouchscreen.c: In function 'usbtouch_probe': drivers/input/touchscreen/usbtouchscreen.c:1668:16:warning: array subscript is outside array bounds of 'struct usbtouch_device_info[0]' [-Warray-bounds] 1668 | type = &usbtouch_dev_info[id->driver_info]; Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20220623062446.16944-1-johan@kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/usbtouchscreen.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/touchscreen/usbtouchscreen.c b/drivers/input/touchscreen/usbtouchscreen.c index 43c521f50c85..3dda6eaabdab 100644 --- a/drivers/input/touchscreen/usbtouchscreen.c +++ b/drivers/input/touchscreen/usbtouchscreen.c @@ -1654,6 +1654,9 @@ static int usbtouch_probe(struct usb_interface *intf, if (id->driver_info == DEVTYPE_IGNORE) return -ENODEV; + if (id->driver_info >= ARRAY_SIZE(usbtouch_dev_info)) + return -ENODEV; + endpoint = usbtouch_get_input_endpoint(intf->cur_altsetting); if (!endpoint) return -ENXIO; From a5bdaae7ae596686b83a8a5038ee6d9afeb24531 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 1 Jun 2022 10:22:39 +0200 Subject: [PATCH 016/248] MAINTAINERS: rectify entry for SYNOPSYS AXS10x RESET CONTROLLER DRIVER Commit 820f722c05dd ("dt-bindings: reset: snps,axs10x-reset: Convert to yaml") converts snps,axs10x-reset.txt to yaml, but misses to adjust its reference in MAINTAINERS. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about a broken reference. Repair this file reference in SYNOPSYS AXS10x RESET CONTROLLER DRIVER. Signed-off-by: Lukas Bulwahn Signed-off-by: Philipp Zabel Link: https://lore.kernel.org/r/20220601082239.12009-1-lukas.bulwahn@gmail.com --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a6d3bd9d2a8d..54ce63e26363 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19220,7 +19220,7 @@ F: arch/arc/plat-axs10x SYNOPSYS AXS10x RESET CONTROLLER DRIVER M: Eugeniy Paltsev S: Supported -F: Documentation/devicetree/bindings/reset/snps,axs10x-reset.txt +F: Documentation/devicetree/bindings/reset/snps,axs10x-reset.yaml F: drivers/reset/reset-axs10x.c SYNOPSYS CREG GPIO DRIVER From a57f68ddc8865d59a19783080cc52fb4a11dc209 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 24 Jun 2022 17:18:45 +0300 Subject: [PATCH 017/248] reset: Fix devm bulk optional exclusive control getter Most likely due to copy-paste mistake the device managed version of the denoted reset control getter has been implemented with invalid semantic, which can be immediately spotted by having "WARN_ON(shared && acquired)" warning in the system log as soon as the method is called. Anyway let's fix it by altering the boolean arguments passed to the __devm_reset_control_bulk_get() method from - shared = true, optional = false, acquired = true to + shared = false, optional = true, acquired = true That's what they were supposed to be in the first place (see the non-devm version of the same method: reset_control_bulk_get_optional_exclusive()). Fixes: 48d71395896d ("reset: Add reset_control_bulk API") Signed-off-by: Serge Semin Reviewed-by: Dmitry Osipenko Signed-off-by: Philipp Zabel Link: https://lore.kernel.org/r/20220624141853.7417-2-Sergey.Semin@baikalelectronics.ru --- include/linux/reset.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/reset.h b/include/linux/reset.h index 8a21b5756c3e..514ddf003efc 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -731,7 +731,7 @@ static inline int __must_check devm_reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, true, false, true); + return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, true); } /** From 8988ba7dec43aabd43adb1214b922b8873e9da88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 22 Jun 2022 18:16:16 +0200 Subject: [PATCH 018/248] spi: aspeed: Add dev_dbg() to dump the spi-mem direct mapping descriptor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default value of the control register is set using the direct mapping information passed to the ->dirmap_create() handler. Dump the mapping range and the SPI memory operation characteristics to analyze how the register value has been computed. spi-aspeed-smc 1e630000.spi: CE0 read dirmap [ 0x00000000 - 0x04000000 ] OP 0x6c mode:1.1.1.4 naddr:0x4 ndummies:0x1 ... spi-aspeed-smc 1e630000.spi: CE0 write dirmap [ 0x00000000 - 0x04000000 ] OP 0x12 mode:1.1.0.1 naddr:0x4 ndummies:0x0 Reviewed-by: Paul Menzel Signed-off-by: Cédric Le Goater Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20220622161617.3719096-2-clg@kaod.org Signed-off-by: Mark Brown --- drivers/spi/spi-aspeed-smc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/spi/spi-aspeed-smc.c b/drivers/spi/spi-aspeed-smc.c index 496f3e1e9079..ac64be289e59 100644 --- a/drivers/spi/spi-aspeed-smc.c +++ b/drivers/spi/spi-aspeed-smc.c @@ -558,6 +558,14 @@ static int aspeed_spi_dirmap_create(struct spi_mem_dirmap_desc *desc) u32 ctl_val; int ret = 0; + dev_dbg(aspi->dev, + "CE%d %s dirmap [ 0x%.8llx - 0x%.8llx ] OP %#x mode:%d.%d.%d.%d naddr:%#x ndummies:%#x\n", + chip->cs, op->data.dir == SPI_MEM_DATA_IN ? "read" : "write", + desc->info.offset, desc->info.offset + desc->info.length, + op->cmd.opcode, op->cmd.buswidth, op->addr.buswidth, + op->dummy.buswidth, op->data.buswidth, + op->addr.nbytes, op->dummy.nbytes); + chip->clk_freq = desc->mem->spi->max_speed_hz; /* Only for reads */ From 30554a1f0fd6a5d2e2413bdc05389995d5611736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 22 Jun 2022 18:16:17 +0200 Subject: [PATCH 019/248] spi: aspeed: Fix division by zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using the normal read operation for data transfers, the dummy bus width is zero. In that case, they are no dummy bytes to transfer and setting the dummy field in the controller register becomes useless. Issue was found on a custom "Bifrost" board based on the AST2500 SoC and using a MX25L51245GMI-08G SPI Flash. Reported-by: Ian Woloschin Reviewed-by: Pratyush Yadav Tested-by: Ian Woloschin Fixes: 9da06d7bdec7dad80 ("spi: aspeed: Add support for direct mapping") Signed-off-by: Cédric Le Goater Link: https://lore.kernel.org/r/20220622161617.3719096-3-clg@kaod.org Signed-off-by: Mark Brown --- drivers/spi/spi-aspeed-smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-aspeed-smc.c b/drivers/spi/spi-aspeed-smc.c index ac64be289e59..3e891bf22470 100644 --- a/drivers/spi/spi-aspeed-smc.c +++ b/drivers/spi/spi-aspeed-smc.c @@ -582,9 +582,11 @@ static int aspeed_spi_dirmap_create(struct spi_mem_dirmap_desc *desc) ctl_val = readl(chip->ctl) & ~CTRL_IO_CMD_MASK; ctl_val |= aspeed_spi_get_io_mode(op) | op->cmd.opcode << CTRL_COMMAND_SHIFT | - CTRL_IO_DUMMY_SET(op->dummy.nbytes / op->dummy.buswidth) | CTRL_IO_MODE_READ; + if (op->dummy.nbytes) + ctl_val |= CTRL_IO_DUMMY_SET(op->dummy.nbytes / op->dummy.buswidth); + /* Tune 4BYTE address mode */ if (op->addr.nbytes) { u32 addr_mode = readl(aspi->regs + CE_CTRL_REG); From 7441b273388b9a59d8387a03ffbbca9d5af6348c Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Mon, 27 Jun 2022 18:41:19 -0700 Subject: [PATCH 020/248] usb: dwc3: gadget: Fix event pending check The DWC3_EVENT_PENDING flag is used to protect against invalid call to top-half interrupt handler, which can occur when there's a delay in software detection of the interrupt line deassertion. However, the clearing of this flag was done prior to unmasking the interrupt line, creating opportunity where the top-half handler can come. This breaks the serialization and creates a race between the top-half and bottom-half handler, resulting in losing synchronization between the controller and the driver when processing events. To fix this, make sure the clearing of the DWC3_EVENT_PENDING is done at the end of the bottom-half handler. Fixes: d325a1de49d6 ("usb: dwc3: gadget: Prevent losing events in event cache") Cc: stable@vger.kernel.org Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/8670aaf1cf52e7d1e6df2a827af2d77263b93b75.1656380429.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 8716bece1072..0d89dfa6eef5 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -4249,7 +4249,6 @@ static irqreturn_t dwc3_process_event_buf(struct dwc3_event_buffer *evt) } evt->count = 0; - evt->flags &= ~DWC3_EVENT_PENDING; ret = IRQ_HANDLED; /* Unmask interrupt */ @@ -4261,6 +4260,9 @@ static irqreturn_t dwc3_process_event_buf(struct dwc3_event_buffer *evt) dwc3_writel(dwc->regs, DWC3_DEV_IMOD(0), dwc->imod_interval); } + /* Keep the clearing of DWC3_EVENT_PENDING at the end */ + evt->flags &= ~DWC3_EVENT_PENDING; + return ret; } From 5812175389e258141c5e9f8eadc1ed226f67bc11 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 29 Jun 2022 17:46:35 +0800 Subject: [PATCH 021/248] usb: dwc3-am62: remove unnecesary clk_put() The clk get by devm_clk_get() will be released in devres_release_all(), so there is no need explicitly call clk_put(), or it will cause UAF. Fixes: e8784c0aec03 ("drivers: usb: dwc3: Add AM62 USB wrapper driver") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20220629094635.3116961-1-yangyingliang@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-am62.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-am62.c b/drivers/usb/dwc3/dwc3-am62.c index fea7aca35dc8..173cf3579c55 100644 --- a/drivers/usb/dwc3/dwc3-am62.c +++ b/drivers/usb/dwc3/dwc3-am62.c @@ -195,8 +195,7 @@ static int dwc3_ti_probe(struct platform_device *pdev) if (i == ARRAY_SIZE(dwc3_ti_rate_table)) { dev_err(dev, "unsupported usb2_refclk rate: %lu KHz\n", rate); - ret = -EINVAL; - goto err_clk_disable; + return -EINVAL; } data->rate_code = i; @@ -204,7 +203,7 @@ static int dwc3_ti_probe(struct platform_device *pdev) /* Read the syscon property and set the rate code */ ret = phy_syscon_pll_refclk(data); if (ret) - goto err_clk_disable; + return ret; /* VBUS divider select */ data->vbus_divider = device_property_read_bool(dev, "ti,vbus-divider"); @@ -245,8 +244,6 @@ err_pm_disable: clk_disable_unprepare(data->usb2_refclk); pm_runtime_disable(dev); pm_runtime_set_suspended(dev); -err_clk_disable: - clk_put(data->usb2_refclk); return ret; } @@ -276,7 +273,6 @@ static int dwc3_ti_remove(struct platform_device *pdev) pm_runtime_disable(dev); pm_runtime_set_suspended(dev); - clk_put(data->usb2_refclk); platform_set_drvdata(pdev, NULL); return 0; } From 5c5f44e36217de5ead789ff25da71c31c2331c96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 27 Jun 2022 18:07:52 +0300 Subject: [PATCH 022/248] serial: stm32: Clear prev values before setting RTS delays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code lacks clearing of previous DEAT/DEDT values. Thus, changing values on the fly results in garbage delays tending towards the maximum value as more and more bits are ORed together. (Leaving RS485 mode would have cleared the old values though). Fixes: 1bcda09d2910 ("serial: stm32: add support for RS485 hardware control mode") Cc: stable Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220627150753.34510-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/stm32-usart.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index b7b44f4050d4..0973b03eeeaa 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -72,6 +72,8 @@ static void stm32_usart_config_reg_rs485(u32 *cr1, u32 *cr3, u32 delay_ADE, *cr3 |= USART_CR3_DEM; over8 = *cr1 & USART_CR1_OVER8; + *cr1 &= ~(USART_CR1_DEDT_MASK | USART_CR1_DEAT_MASK); + if (over8) rs485_deat_dedt = delay_ADE * baud * 8; else From 6e690d54cfa802f939cefbd2fa2c91bd0b8bd1b6 Mon Sep 17 00:00:00 2001 From: Yi Yang Date: Tue, 28 Jun 2022 16:35:15 +0800 Subject: [PATCH 023/248] serial: 8250: fix return error code in serial8250_request_std_resource() If port->mapbase = NULL in serial8250_request_std_resource() , it need return a error code instead of 0. If uart_set_info() fail to request new regions by serial8250_request_std_resource() but the return value of serial8250_request_std_resource() is 0, The system incorrectly considers that the resource application is successful and does not attempt to restore the old setting. A null pointer reference is triggered when the port resource is later invoked. Signed-off-by: Yi Yang Cc: stable Link: https://lore.kernel.org/r/20220628083515.64138-1-yiyang13@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index b9cdc5552b0d..3c36a06a20b0 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -2975,8 +2975,10 @@ static int serial8250_request_std_resource(struct uart_8250_port *up) case UPIO_MEM32BE: case UPIO_MEM16: case UPIO_MEM: - if (!port->mapbase) + if (!port->mapbase) { + ret = -EINVAL; break; + } if (!request_mem_region(port->mapbase, size, "serial")) { ret = -EBUSY; From f9b11229b79c0fb2100b5bb4628a101b1d37fbf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 29 Jun 2022 12:48:41 +0300 Subject: [PATCH 024/248] serial: 8250: Fix PM usage_count for console handover MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When console is enabled, univ8250_console_setup() calls serial8250_console_setup() before .dev is set to uart_port. Therefore, it will not call pm_runtime_get_sync(). Later, when the actual driver is going to take over univ8250_console_exit() is called. As .dev is already set, serial8250_console_exit() makes pm_runtime_put_sync() call with usage count being zero triggering PM usage count warning (extra debug for univ8250_console_setup(), univ8250_console_exit(), and serial8250_register_ports()): [ 0.068987] univ8250_console_setup ttyS0 nodev [ 0.499670] printk: console [ttyS0] enabled [ 0.717955] printk: console [ttyS0] printing thread started [ 1.960163] serial8250_register_ports assigned dev for ttyS0 [ 1.976830] printk: console [ttyS0] disabled [ 1.976888] printk: console [ttyS0] printing thread stopped [ 1.977073] univ8250_console_exit ttyS0 usage:0 [ 1.977075] serial8250 serial8250: Runtime PM usage count underflow! [ 1.977429] dw-apb-uart.6: ttyS0 at MMIO 0x4010006000 (irq = 33, base_baud = 115200) is a 16550A [ 1.977812] univ8250_console_setup ttyS0 usage:2 [ 1.978167] printk: console [ttyS0] printing thread started [ 1.978203] printk: console [ttyS0] enabled To fix the issue, call pm_runtime_get_sync() in serial8250_register_ports() as soon as .dev is set for an uart_port if it has console enabled. This problem became apparent only recently because 82586a721595 ("PM: runtime: Avoid device usage count underflows") added the warning printout. I confirmed this problem also occurs with v5.18 (w/o the warning printout, obviously). Fixes: bedb404e91bb ("serial: 8250_port: Don't use power management for kernel console") Cc: stable Tested-by: Tony Lindgren Reviewed-by: Andy Shevchenko Reviewed-by: Tony Lindgren Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/b4f428e9-491f-daf2-2232-819928dc276e@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 4 ++++ drivers/tty/serial/serial_core.c | 5 ----- include/linux/serial_core.h | 5 +++++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index cfbd2de0ca6e..3f56dbc9432b 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -559,6 +560,9 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) up->port.dev = dev; + if (uart_console_enabled(&up->port)) + pm_runtime_get_sync(up->port.dev); + serial8250_apply_quirks(up); uart_add_one_port(drv, &up->port); } diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 338ebadfd44b..3dc926d6c00a 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -1941,11 +1941,6 @@ static int uart_proc_show(struct seq_file *m, void *v) } #endif -static inline bool uart_console_enabled(struct uart_port *port) -{ - return uart_console(port) && (port->cons->flags & CON_ENABLED); -} - static void uart_port_spin_lock_init(struct uart_port *port) { spin_lock_init(&port->lock); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 657a0fc68a3f..fde258b3decd 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -390,6 +390,11 @@ static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED; static inline int setup_earlycon(char *buf) { return 0; } #endif +static inline bool uart_console_enabled(struct uart_port *port) +{ + return uart_console(port) && (port->cons->flags & CON_ENABLED); +} + struct uart_port *uart_get_console(struct uart_port *ports, int nr, struct console *c); int uart_parse_earlycon(char *p, unsigned char *iotype, resource_size_t *addr, From 4f532c1e25319e42996ec18a1f473fd50c8e575d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 28 Jun 2022 12:09:22 +0200 Subject: [PATCH 025/248] serial: mvebu-uart: correctly report configured baudrate value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Functions tty_termios_encode_baud_rate() and uart_update_timeout() should be called with the baudrate value which was set to hardware. Linux then report exact values via ioctl(TCGETS2) to userspace. Change mvebu_uart_baud_rate_set() function to return baudrate value which was set to hardware and propagate this value to above mentioned functions. With this change userspace would see precise value in termios c_ospeed field. Fixes: 68a0db1d7da2 ("serial: mvebu-uart: add function to change baudrate") Cc: stable Reviewed-by: Ilpo Järvinen Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20220628100922.10717-1-pali@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/mvebu-uart.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/tty/serial/mvebu-uart.c b/drivers/tty/serial/mvebu-uart.c index 0429c2a54290..93489fe334d0 100644 --- a/drivers/tty/serial/mvebu-uart.c +++ b/drivers/tty/serial/mvebu-uart.c @@ -470,14 +470,14 @@ static void mvebu_uart_shutdown(struct uart_port *port) } } -static int mvebu_uart_baud_rate_set(struct uart_port *port, unsigned int baud) +static unsigned int mvebu_uart_baud_rate_set(struct uart_port *port, unsigned int baud) { unsigned int d_divisor, m_divisor; unsigned long flags; u32 brdv, osamp; if (!port->uartclk) - return -EOPNOTSUPP; + return 0; /* * The baudrate is derived from the UART clock thanks to divisors: @@ -548,7 +548,7 @@ static int mvebu_uart_baud_rate_set(struct uart_port *port, unsigned int baud) (m_divisor << 16) | (m_divisor << 24); writel(osamp, port->membase + UART_OSAMP); - return 0; + return DIV_ROUND_CLOSEST(port->uartclk, d_divisor * m_divisor); } static void mvebu_uart_set_termios(struct uart_port *port, @@ -587,15 +587,11 @@ static void mvebu_uart_set_termios(struct uart_port *port, max_baud = port->uartclk / 80; baud = uart_get_baud_rate(port, termios, old, min_baud, max_baud); - if (mvebu_uart_baud_rate_set(port, baud)) { - /* No clock available, baudrate cannot be changed */ - if (old) - baud = uart_get_baud_rate(port, old, NULL, - min_baud, max_baud); - } else { - tty_termios_encode_baud_rate(termios, baud, baud); - uart_update_timeout(port, termios->c_cflag, baud); - } + baud = mvebu_uart_baud_rate_set(port, baud); + + /* In case baudrate cannot be changed, report previous old value */ + if (baud == 0 && old) + baud = tty_termios_baud_rate(old); /* Only the following flag changes are supported */ if (old) { @@ -606,6 +602,11 @@ static void mvebu_uart_set_termios(struct uart_port *port, termios->c_cflag |= CS8; } + if (baud != 0) { + tty_termios_encode_baud_rate(termios, baud, baud); + uart_update_timeout(port, termios->c_cflag, baud); + } + spin_unlock_irqrestore(&port->lock, flags); } From 39cdb68c64d84e71a4a717000b6e5de208ee60cc Mon Sep 17 00:00:00 2001 From: Yangxi Xiang Date: Tue, 28 Jun 2022 17:33:22 +0800 Subject: [PATCH 026/248] vt: fix memory overlapping when deleting chars in the buffer A memory overlapping copy occurs when deleting a long line. This memory overlapping copy can cause data corruption when scr_memcpyw is optimized to memcpy because memcpy does not ensure its behavior if the destination buffer overlaps with the source buffer. The line buffer is not always broken, because the memcpy utilizes the hardware acceleration, whose result is not deterministic. Fix this problem by using replacing the scr_memcpyw with scr_memmovew. Fixes: 81732c3b2fed ("tty vt: Fix line garbage in virtual console on command line edition") Cc: stable Signed-off-by: Yangxi Xiang Link: https://lore.kernel.org/r/20220628093322.5688-1-xyangxi5@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index f8c87c4d7399..dfc1f4b445f3 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -855,7 +855,7 @@ static void delete_char(struct vc_data *vc, unsigned int nr) unsigned short *p = (unsigned short *) vc->vc_pos; vc_uniscr_delete(vc, nr); - scr_memcpyw(p, p + nr, (vc->vc_cols - vc->state.x - nr) * 2); + scr_memmovew(p, p + nr, (vc->vc_cols - vc->state.x - nr) * 2); scr_memsetw(p + vc->vc_cols - vc->state.x - nr, vc->vc_video_erase_char, nr * 2); vc->vc_need_wrap = 0; From b941e487152e0909ef43faacae6eeee266d9b378 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Thu, 30 Jun 2022 09:39:09 +0100 Subject: [PATCH 027/248] serial: 8250: dw: Fix the macro RZN1_UART_xDMACR_8_WORD_BURST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As per RZ/N1 peripheral user manual(r01uh0752ej0100-rzn1-peripheral.pdf) rev 1.0.0 Mar,2019, the value for 8_WORD_BURST is 4(b2,b1=2’b10). This patch fixes the macro as per the user manual. Fixes: aa63d786cea2 ("serial: 8250: dw: Add support for DMA flow controlling devices") Reviewed-by: Phil Edworthy Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20220630083909.4294-1-biju.das.jz@bp.renesas.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_dw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c index b1ab088fbe6e..bb6aca07ab56 100644 --- a/drivers/tty/serial/8250/8250_dw.c +++ b/drivers/tty/serial/8250/8250_dw.c @@ -47,7 +47,7 @@ #define RZN1_UART_xDMACR_DMA_EN BIT(0) #define RZN1_UART_xDMACR_1_WORD_BURST (0 << 1) #define RZN1_UART_xDMACR_4_WORD_BURST (1 << 1) -#define RZN1_UART_xDMACR_8_WORD_BURST (3 << 1) +#define RZN1_UART_xDMACR_8_WORD_BURST (2 << 1) #define RZN1_UART_xDMACR_BLK_SZ(x) ((x) << 3) /* Quirks */ From 03110b46c99bb0c712f46bec660b1c3f674ce100 Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Mon, 6 Jun 2022 18:04:21 +0200 Subject: [PATCH 028/248] ARM: dts: qcom: msm8974: re-add missing pinctrl As part of a recent cleanup commit, the pinctrl for a few uart and i2c nodes was removed. Adjust the names and/or add it back and assign it to the uart and i2c nodes. Fixes: 1dfe967ec7cf ("ARM: dts: qcom-msm8974*: Consolidate I2C/UART/SDHCI") Signed-off-by: Luca Weiss Reviewed-by: Konrad Dybcio Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220606160421.1641778-1-luca@z3ntu.xyz --- arch/arm/boot/dts/qcom-msm8974.dtsi | 30 +++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/qcom-msm8974.dtsi b/arch/arm/boot/dts/qcom-msm8974.dtsi index 814ad0b46232..c3b8a6d63027 100644 --- a/arch/arm/boot/dts/qcom-msm8974.dtsi +++ b/arch/arm/boot/dts/qcom-msm8974.dtsi @@ -506,6 +506,8 @@ interrupts = ; clocks = <&gcc GCC_BLSP1_UART2_APPS_CLK>, <&gcc GCC_BLSP1_AHB_CLK>; clock-names = "core", "iface"; + pinctrl-names = "default"; + pinctrl-0 = <&blsp1_uart2_default>; status = "disabled"; }; @@ -581,6 +583,9 @@ interrupts = ; clocks = <&gcc GCC_BLSP2_UART1_APPS_CLK>, <&gcc GCC_BLSP2_AHB_CLK>; clock-names = "core", "iface"; + pinctrl-names = "default", "sleep"; + pinctrl-0 = <&blsp2_uart1_default>; + pinctrl-1 = <&blsp2_uart1_sleep>; status = "disabled"; }; @@ -599,6 +604,8 @@ interrupts = ; clocks = <&gcc GCC_BLSP2_UART4_APPS_CLK>, <&gcc GCC_BLSP2_AHB_CLK>; clock-names = "core", "iface"; + pinctrl-names = "default"; + pinctrl-0 = <&blsp2_uart4_default>; status = "disabled"; }; @@ -639,6 +646,9 @@ interrupts = <0 106 IRQ_TYPE_LEVEL_HIGH>; clocks = <&gcc GCC_BLSP2_QUP6_I2C_APPS_CLK>, <&gcc GCC_BLSP2_AHB_CLK>; clock-names = "core", "iface"; + pinctrl-names = "default", "sleep"; + pinctrl-0 = <&blsp2_i2c6_default>; + pinctrl-1 = <&blsp2_i2c6_sleep>; #address-cells = <1>; #size-cells = <0>; }; @@ -1256,7 +1266,7 @@ }; }; - blsp1_uart2_active: blsp1-uart2-active { + blsp1_uart2_default: blsp1-uart2-default { rx { pins = "gpio5"; function = "blsp_uart2"; @@ -1272,7 +1282,7 @@ }; }; - blsp2_uart1_active: blsp2-uart1-active { + blsp2_uart1_default: blsp2-uart1-default { tx-rts { pins = "gpio41", "gpio44"; function = "blsp_uart7"; @@ -1295,7 +1305,7 @@ bias-pull-down; }; - blsp2_uart4_active: blsp2-uart4-active { + blsp2_uart4_default: blsp2-uart4-default { tx-rts { pins = "gpio53", "gpio56"; function = "blsp_uart10"; @@ -1406,7 +1416,19 @@ bias-pull-up; }; - /* BLSP2_I2C6 info is missing - nobody uses it though? */ + blsp2_i2c6_default: blsp2-i2c6-default { + pins = "gpio87", "gpio88"; + function = "blsp_i2c12"; + drive-strength = <2>; + bias-disable; + }; + + blsp2_i2c6_sleep: blsp2-i2c6-sleep { + pins = "gpio87", "gpio88"; + function = "blsp_i2c12"; + drive-strength = <2>; + bias-pull-up; + }; spi8_default: spi8_default { mosi { From 79471f29ec4870bc02b4fea844e86669a8a4f2a5 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Fri, 1 Jul 2022 22:29:21 -0500 Subject: [PATCH 029/248] dt-bindings: display: sun4i: Fix D1 pipeline count When adding the bindings for the D1 display engine, I missed the condition for the number of pipelines. D1 has two mixers, so it will have two pipeline references. Fixes: ae5a5d26c15c ("dt-bindings: display: Add D1 display engine compatibles") Signed-off-by: Samuel Holland Reviewed-by: Jernej Skrabec Signed-off-by: Jernej Skrabec Link: https://lore.kernel.org/r/20220702032921.22433-1-samuel@sholland.org --- .../bindings/display/allwinner,sun4i-a10-display-engine.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-engine.yaml b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-engine.yaml index c388ae5da1e4..c9c346e6228e 100644 --- a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-engine.yaml +++ b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-engine.yaml @@ -94,6 +94,7 @@ if: - allwinner,sun8i-a83t-display-engine - allwinner,sun8i-r40-display-engine - allwinner,sun9i-a80-display-engine + - allwinner,sun20i-d1-display-engine - allwinner,sun50i-a64-display-engine then: From 3c12e9da3098a30fc82dea01768d355c28e3692d Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 28 Apr 2022 14:16:59 -0400 Subject: [PATCH 030/248] arm64: dts: ls1028a: Update SFP node to include clock The clocks property is now mandatory. Add it to avoid warning message. Signed-off-by: Sean Anderson Reviewed-by: Michael Walle Fixes: eba5bea8f37f ("arm64: dts: ls1028a: add efuse node") Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index 92465f777603..d5cdd77e5a95 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -224,9 +224,12 @@ little-endian; }; - efuse@1e80000 { + sfp: efuse@1e80000 { compatible = "fsl,ls1028a-sfp"; reg = <0x0 0x1e80000 0x0 0x10000>; + clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL + QORIQ_CLK_PLL_DIV(4)>; + clock-names = "sfp"; #address-cells = <1>; #size-cells = <1>; From e95ea0f687e679fcb0a3a67d0755b81ee7d60db0 Mon Sep 17 00:00:00 2001 From: Kris Bahnsen Date: Thu, 30 Jun 2022 14:03:27 -0700 Subject: [PATCH 031/248] ARM: dts: imx6qdl-ts7970: Fix ngpio typo and count Device-tree incorrectly used "ngpio" which caused the driver to fallback to 32 ngpios. This platform has 62 GPIO registers. Fixes: 9ff8e9fccef9 ("ARM: dts: TS-7970: add basic device tree") Signed-off-by: Kris Bahnsen Reviewed-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6qdl-ts7970.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/imx6qdl-ts7970.dtsi b/arch/arm/boot/dts/imx6qdl-ts7970.dtsi index fded07f370b3..d6ba4b2a60f6 100644 --- a/arch/arm/boot/dts/imx6qdl-ts7970.dtsi +++ b/arch/arm/boot/dts/imx6qdl-ts7970.dtsi @@ -226,7 +226,7 @@ reg = <0x28>; #gpio-cells = <2>; gpio-controller; - ngpio = <32>; + ngpios = <62>; }; sgtl5000: codec@a { From efa310ba00716d7a872bdc5fa1f5545edc9efd69 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 29 Jun 2022 21:07:33 +0100 Subject: [PATCH 032/248] riscv: dts: microchip: hook up the mpfs' l2cache The initial PolarFire SoC devicetree must have been forked off from the fu540 one prior to the addition of l2cache controller support being added there. When the controller node was added to mpfs.dtsi, it was not hooked up to the CPUs & thus sysfs reports an incorrect cache configuration. Hook it up. Fixes: 0fa6107eca41 ("RISC-V: Initial DTS for Microchip ICICLE board") Reviewed-by: Sudeep Holla Reviewed-by: Daire McNamara Signed-off-by: Conor Dooley --- arch/riscv/boot/dts/microchip/mpfs.dtsi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/boot/dts/microchip/mpfs.dtsi b/arch/riscv/boot/dts/microchip/mpfs.dtsi index 3095d08453a1..496d3b7642bd 100644 --- a/arch/riscv/boot/dts/microchip/mpfs.dtsi +++ b/arch/riscv/boot/dts/microchip/mpfs.dtsi @@ -50,6 +50,7 @@ riscv,isa = "rv64imafdc"; clocks = <&clkcfg CLK_CPU>; tlb-split; + next-level-cache = <&cctrllr>; status = "okay"; cpu1_intc: interrupt-controller { @@ -77,6 +78,7 @@ riscv,isa = "rv64imafdc"; clocks = <&clkcfg CLK_CPU>; tlb-split; + next-level-cache = <&cctrllr>; status = "okay"; cpu2_intc: interrupt-controller { @@ -104,6 +106,7 @@ riscv,isa = "rv64imafdc"; clocks = <&clkcfg CLK_CPU>; tlb-split; + next-level-cache = <&cctrllr>; status = "okay"; cpu3_intc: interrupt-controller { @@ -131,6 +134,7 @@ riscv,isa = "rv64imafdc"; clocks = <&clkcfg CLK_CPU>; tlb-split; + next-level-cache = <&cctrllr>; status = "okay"; cpu4_intc: interrupt-controller { #interrupt-cells = <1>; From 6ece49c56965544262523dae4a071ace3db63507 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Wed, 6 Jul 2022 13:06:22 +0300 Subject: [PATCH 033/248] spi: amd: Limit max transfer and message size Enabling the SPI CS35L41 audio codec driver for Steam Deck [1] revealed a problem with the current AMD SPI controller driver implementation, consisting of an unrecoverable system hang. The issue can be prevented if we ensure the max transfer size and the max message size do not exceed the FIFO buffer size. According to the implementation of the downstream driver, the AMD SPI controller is not able to handle more than 70 bytes per transfer, which corresponds to the size of the FIFO buffer. Hence, let's fix this by setting the SPI limits mentioned above. [1] https://lore.kernel.org/r/20220621213819.262537-1-cristian.ciocaltea@collabora.com Reported-by: Anastasios Vacharakis Fixes: bbb336f39efc ("spi: spi-amd: Add AMD SPI controller driver support") Signed-off-by: Cristian Ciocaltea Link: https://lore.kernel.org/r/20220706100626.1234731-2-cristian.ciocaltea@collabora.com Signed-off-by: Mark Brown --- drivers/spi/spi-amd.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/spi/spi-amd.c b/drivers/spi/spi-amd.c index cba6a4486c24..efdcbe6c4c26 100644 --- a/drivers/spi/spi-amd.c +++ b/drivers/spi/spi-amd.c @@ -33,6 +33,7 @@ #define AMD_SPI_RX_COUNT_REG 0x4B #define AMD_SPI_STATUS_REG 0x4C +#define AMD_SPI_FIFO_SIZE 70 #define AMD_SPI_MEM_SIZE 200 /* M_CMD OP codes for SPI */ @@ -270,6 +271,11 @@ static int amd_spi_master_transfer(struct spi_master *master, return 0; } +static size_t amd_spi_max_transfer_size(struct spi_device *spi) +{ + return AMD_SPI_FIFO_SIZE; +} + static int amd_spi_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -302,6 +308,8 @@ static int amd_spi_probe(struct platform_device *pdev) master->flags = SPI_MASTER_HALF_DUPLEX; master->setup = amd_spi_master_setup; master->transfer_one_message = amd_spi_master_transfer; + master->max_transfer_size = amd_spi_max_transfer_size; + master->max_message_size = amd_spi_max_transfer_size; /* Register the controller with SPI framework */ err = devm_spi_register_master(dev, master); From fa293fb960ab8350c92e2327a08fc141f228b044 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 7 Jul 2022 13:26:45 +0200 Subject: [PATCH 034/248] MAINTAINERS: mark ARM/PALM TREO SUPPORT orphan The email address sleep_walker@suse.com and the url http://hackndev.com/, provided in the ARM/PALM TREO SUPPORT section, are not reachable anymore. Make this machine support orphan, and give somebody the chance to step up. Move the maintainer into CREDITS to keep the attribution to his work. Signed-off-by: Lukas Bulwahn Signed-off-by: Arnd Bergmann --- CREDITS | 4 ++++ MAINTAINERS | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CREDITS b/CREDITS index 7e85a53b6a88..40d3c655b567 100644 --- a/CREDITS +++ b/CREDITS @@ -627,6 +627,10 @@ S: 48287 Sawleaf S: Fremont, California 94539 S: USA +N: Tomas Cech +E: sleep_walker@suse.com +D: arm/palm treo support + N: Florent Chabaud E: florent.chabaud@polytechnique.org D: software suspend diff --git a/MAINTAINERS b/MAINTAINERS index 31f607347820..e20124db1381 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2499,10 +2499,8 @@ F: drivers/power/reset/oxnas-restart.c N: oxnas ARM/PALM TREO SUPPORT -M: Tomas Cech L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Maintained -W: http://hackndev.com +S: Orphan F: arch/arm/mach-pxa/palmtreo.* ARM/PALMTX,PALMT5,PALMLD,PALMTE2,PALMTC SUPPORT From ccd3f449052449a917a3e577d8ba0368f43b8f29 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 27 Jun 2022 21:23:25 -0500 Subject: [PATCH 035/248] scsi: target: Fix WRITE_SAME No Data Buffer crash In newer version of the SBC specs, we have a NDOB bit that indicates there is no data buffer that gets written out. If this bit is set using commands like "sg_write_same --ndob" we will crash in target_core_iblock/file's execute_write_same handlers when we go to access the se_cmd->t_data_sg because its NULL. This patch adds a check for the NDOB bit in the common WRITE SAME code because we don't support it. And, it adds a check for zero SG elements in each handler in case the initiator tries to send a normal WRITE SAME with no data buffer. Link: https://lore.kernel.org/r/20220628022325.14627-2-michael.christie@oracle.com Reviewed-by: Christoph Hellwig Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/target/target_core_file.c | 3 +++ drivers/target/target_core_iblock.c | 4 ++++ drivers/target/target_core_sbc.c | 6 ++++++ 3 files changed, 13 insertions(+) diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index e68f1cc8ef98..6c8d8b051bfd 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -448,6 +448,9 @@ fd_execute_write_same(struct se_cmd *cmd) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } + if (!cmd->t_data_nents) + return TCM_INVALID_CDB_FIELD; + if (cmd->t_data_nents > 1 || cmd->t_data_sg[0].length != cmd->se_dev->dev_attrib.block_size) { pr_err("WRITE_SAME: Illegal SGL t_data_nents: %u length: %u" diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 378c80313a0f..1ed9381751e6 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -494,6 +494,10 @@ iblock_execute_write_same(struct se_cmd *cmd) " backends not supported\n"); return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } + + if (!cmd->t_data_nents) + return TCM_INVALID_CDB_FIELD; + sg = &cmd->t_data_sg[0]; if (cmd->t_data_nents > 1 || diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c index ca1b2312d6e7..f6132836eb38 100644 --- a/drivers/target/target_core_sbc.c +++ b/drivers/target/target_core_sbc.c @@ -312,6 +312,12 @@ sbc_setup_write_same(struct se_cmd *cmd, unsigned char flags, struct sbc_ops *op pr_warn("WRITE SAME with ANCHOR not supported\n"); return TCM_INVALID_CDB_FIELD; } + + if (flags & 0x01) { + pr_warn("WRITE SAME with NDOB not supported\n"); + return TCM_INVALID_CDB_FIELD; + } + /* * Special case for WRITE_SAME w/ UNMAP=1 that ends up getting * translated into block discard requests within backend code. From dc5cb7a833e83a0d51373ba7d96e2ed8e1890944 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Wed, 8 Jun 2022 14:08:49 +0200 Subject: [PATCH 036/248] riscv: don't warn for sifive erratas in modules The SiFive errata code contains code checking applicable erratas vs. actually applied erratas to suggest missing erratas to the user when their Kconfig options are not enabled. In the main kernel image one can be quite sure that all available erratas appear at least once, so that check will succeed. On the other hand modules can very well not use any errata-relevant code, so the newly added module-alternative support may also patch the module code, but not touch SiFive-specific erratas at all. So to restore the original behaviour don't warn when patching modules. This will keep the warning if necessary for the main kernel image but prevent spurious warnings for modules. Of course having such a vendor-specific warning may not be needed at all, as CONFIG_ERRATA_SIFIVE is selected by CONFIG_SOC_SIFIVE and the individual erratas are default-y so disabling them requires deliberate action anyway. But for now just restore the old behaviour. Fixes: a8e910168bba ("riscv: implement module alternatives") Reported-by: Ron Economos Signed-off-by: Heiko Stuebner Tested-by: Ron Economos Link: https://lore.kernel.org/r/20220608120849.1695191-1-heiko@sntech.de Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/errata/sifive/errata.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c index 672f02b21ce0..1031038423e7 100644 --- a/arch/riscv/errata/sifive/errata.c +++ b/arch/riscv/errata/sifive/errata.c @@ -111,6 +111,7 @@ void __init_or_module sifive_errata_patch_func(struct alt_entry *begin, cpu_apply_errata |= tmp; } } - if (cpu_apply_errata != cpu_req_errata) + if (stage != RISCV_ALTERNATIVES_MODULE && + cpu_apply_errata != cpu_req_errata) warn_miss_errata(cpu_req_errata - cpu_apply_errata); } From 2058dc831ff82eb8e93e882efd1ca964bd8a74c8 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 7 Jul 2022 15:20:42 +0100 Subject: [PATCH 037/248] MAINTAINERS: add polarfire rng, pci and clock drivers Hardware random, PCI and clock drivers for the PolarFire SoC have been upstreamed but are not covered by the MAINTAINERS entry, so add them. Daire is the author of the clock & PCI drivers, so add him as a maintainer in place of Lewis. Signed-off-by: Conor Dooley Acked-by: Bjorn Helgaas Acked-by: Stephen Boyd Link: https://lore.kernel.org/r/20220707142041.4096246-1-conor.dooley@microchip.com' Signed-off-by: Arnd Bergmann --- MAINTAINERS | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e20124db1381..7a3eab75f967 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17202,12 +17202,15 @@ N: riscv K: riscv RISC-V/MICROCHIP POLARFIRE SOC SUPPORT -M: Lewis Hanly M: Conor Dooley +M: Daire McNamara L: linux-riscv@lists.infradead.org S: Supported F: arch/riscv/boot/dts/microchip/ +F: drivers/char/hw_random/mpfs-rng.c +F: drivers/clk/microchip/clk-mpfs.c F: drivers/mailbox/mailbox-mpfs.c +F: drivers/pci/controller/pcie-microchip-host.c F: drivers/soc/microchip/ F: include/soc/microchip/mpfs.h From d4fac258d971bead9a6b5c5ebe2f0e415d05d110 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Jun 2022 14:52:16 +0800 Subject: [PATCH 038/248] optee: smc_abi.c: fix wrong pointer passed to IS_ERR/PTR_ERR() In optee_smc_do_call_with_arg() there is a code path when the argument struct for RPC is passed appended to the primary argument struct. When the address of the RPC struct is retrieved there's an invalid check for success. It should be 'rpc_arg' pass to IS_ERR/PTR_ERR(). Fixes: ed8faf6c8f8c ("optee: add OPTEE_SMC_CALL_WITH_RPC_ARG and OPTEE_SMC_CALL_WITH_REGD_ARG") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang [jw: added background to the problem] Signed-off-by: Jens Wiklander --- drivers/tee/optee/smc_abi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tee/optee/smc_abi.c b/drivers/tee/optee/smc_abi.c index 385cb0aee610..a1c1fa1a9c28 100644 --- a/drivers/tee/optee/smc_abi.c +++ b/drivers/tee/optee/smc_abi.c @@ -884,8 +884,8 @@ static int optee_smc_do_call_with_arg(struct tee_context *ctx, rpc_arg_offs = OPTEE_MSG_GET_ARG_SIZE(arg->num_params); rpc_arg = tee_shm_get_va(shm, offs + rpc_arg_offs); - if (IS_ERR(arg)) - return PTR_ERR(arg); + if (IS_ERR(rpc_arg)) + return PTR_ERR(rpc_arg); } if (rpc_arg && tee_shm_is_dynamic(shm)) { From 6177a50fd32c6fd956c7265bc5297e725d221bfc Mon Sep 17 00:00:00 2001 From: Max Krummenacher Date: Tue, 5 Jul 2022 10:58:24 +0200 Subject: [PATCH 039/248] ARM: dts: colibri-imx6ull: fix snvs pinmux group A pin controlled by the iomuxc-snvs pin controller must be specified under the dtb's iomuxc-snvs node. Move the one and only pin of that category from the iomuxc node and set the pinctrl-0 using it accordingly. Fixes: 2aa9d6201949 ("ARM: dts: imx6ull-colibri: add touchscreen device nodes") Signed-off-by: Max Krummenacher Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6ull-colibri.dtsi | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/imx6ull-colibri.dtsi b/arch/arm/boot/dts/imx6ull-colibri.dtsi index 15621e03fa4d..2c3ae715c683 100644 --- a/arch/arm/boot/dts/imx6ull-colibri.dtsi +++ b/arch/arm/boot/dts/imx6ull-colibri.dtsi @@ -166,7 +166,7 @@ atmel_mxt_ts: touchscreen@4a { compatible = "atmel,maxtouch"; pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_atmel_conn>; + pinctrl-0 = <&pinctrl_atmel_conn &pinctrl_atmel_snvs_conn>; reg = <0x4a>; interrupt-parent = <&gpio5>; interrupts = <4 IRQ_TYPE_EDGE_FALLING>; /* SODIMM 107 / INT */ @@ -331,7 +331,6 @@ pinctrl_atmel_conn: atmelconngrp { fsl,pins = < MX6UL_PAD_JTAG_MOD__GPIO1_IO10 0xb0a0 /* SODIMM 106 */ - MX6ULL_PAD_SNVS_TAMPER4__GPIO5_IO04 0xb0a0 /* SODIMM 107 */ >; }; @@ -684,6 +683,12 @@ }; &iomuxc_snvs { + pinctrl_atmel_snvs_conn: atmelsnvsconngrp { + fsl,pins = < + MX6ULL_PAD_SNVS_TAMPER4__GPIO5_IO04 0xb0a0 /* SODIMM 107 */ + >; + }; + pinctrl_snvs_gpio1: snvsgpio1grp { fsl,pins = < MX6ULL_PAD_SNVS_TAMPER6__GPIO5_IO06 0x110a0 /* SODIMM 93 */ From 86c43ea071ae9988b52fd0f654de439da4b5c20a Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 4 Jul 2022 17:08:08 +0200 Subject: [PATCH 040/248] ARM: dts: kswitch-d10: use open drain mode for coma-mode pins The driver use the coma-mode pins as open-drain. Flag them in the device tree accordingly. This avoids the following error: [ 14.114180] gpio-2007 (coma-mode): enforced open drain please flag it properly in DT/ACPI DSDT/board file Fixes: 46a9556d977e ("ARM: dts: kswitch-d10: enable networking") Signed-off-by: Michael Walle Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220704150808.1104295-1-michael@walle.cc --- arch/arm/boot/dts/lan966x-kontron-kswitch-d10-mmt.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/lan966x-kontron-kswitch-d10-mmt.dtsi b/arch/arm/boot/dts/lan966x-kontron-kswitch-d10-mmt.dtsi index 4cab1b3b3b29..725dcf707b31 100644 --- a/arch/arm/boot/dts/lan966x-kontron-kswitch-d10-mmt.dtsi +++ b/arch/arm/boot/dts/lan966x-kontron-kswitch-d10-mmt.dtsi @@ -87,22 +87,22 @@ phy4: ethernet-phy@5 { reg = <5>; - coma-mode-gpios = <&gpio 37 GPIO_ACTIVE_HIGH>; + coma-mode-gpios = <&gpio 37 GPIO_OPEN_DRAIN>; }; phy5: ethernet-phy@6 { reg = <6>; - coma-mode-gpios = <&gpio 37 GPIO_ACTIVE_HIGH>; + coma-mode-gpios = <&gpio 37 GPIO_OPEN_DRAIN>; }; phy6: ethernet-phy@7 { reg = <7>; - coma-mode-gpios = <&gpio 37 GPIO_ACTIVE_HIGH>; + coma-mode-gpios = <&gpio 37 GPIO_OPEN_DRAIN>; }; phy7: ethernet-phy@8 { reg = <8>; - coma-mode-gpios = <&gpio 37 GPIO_ACTIVE_HIGH>; + coma-mode-gpios = <&gpio 37 GPIO_OPEN_DRAIN>; }; }; From b66527ee98d0e12fbf570d394fbea2be4ef1229e Mon Sep 17 00:00:00 2001 From: Jiang Jian Date: Wed, 22 Jun 2022 00:16:48 +0800 Subject: [PATCH 041/248] optee: Remove duplicate 'of' in two places. file: ./drivers/tee/optee/optee_smc.h line: 192 * a2 Size of of SHM chanegd to * a2 Size of SHM Signed-off-by: Jiang Jian Signed-off-by: Jens Wiklander --- drivers/tee/optee/optee_smc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tee/optee/optee_smc.h b/drivers/tee/optee/optee_smc.h index c60896cf71cb..73b5e7760d10 100644 --- a/drivers/tee/optee/optee_smc.h +++ b/drivers/tee/optee/optee_smc.h @@ -189,7 +189,7 @@ struct optee_smc_call_get_os_revision_result { * Have config return register usage: * a0 OPTEE_SMC_RETURN_OK * a1 Physical address of start of SHM - * a2 Size of of SHM + * a2 Size of SHM * a3 Cache settings of memory, as defined by the * OPTEE_SMC_SHM_* values above * a4-7 Preserved From e5ce073c8a1e01b215a5eb32ba48f8d17ded3bd5 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 6 Jun 2022 13:43:53 +0200 Subject: [PATCH 042/248] tee: tee_get_drvdata(): fix description of return value This patch fixes the description of tee_get_drvdata()'s return value. It actually returns the driver_data pointer supplied to tee_device_alloc() since the TEE subsystem was added to the kernel. Fixes: 967c9cca2cc5 ("tee: generic TEE subsystem") Cc: Jens Wiklander Signed-off-by: Marc Kleine-Budde Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index af0f7c603fa4..98da206cd761 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -1073,7 +1073,7 @@ EXPORT_SYMBOL_GPL(tee_device_unregister); /** * tee_get_drvdata() - Return driver_data pointer * @teedev: Device containing the driver_data pointer - * @returns the driver_data pointer supplied to tee_register(). + * @returns the driver_data pointer supplied to tee_device_alloc(). */ void *tee_get_drvdata(struct tee_device *teedev) { From 925b6e59138cefa47275c67891c65d48d3266d57 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Fri, 8 Jul 2022 02:30:47 -0700 Subject: [PATCH 043/248] Revert "drm/amdgpu: add drm buddy support to amdgpu" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit c9cad937c0c58618fe5b0310fd539a854dc1ae95. This is part of a revert of the following commits: commit 708d19d9f362 ("drm/amdgpu: move internal vram_mgr function into the C file") commit 5e3f1e7729ec ("drm/amdgpu: fix start calculation in amdgpu_vram_mgr_new") commit c9cad937c0c5 ("drm/amdgpu: add drm buddy support to amdgpu") [WHY] Few users reported garbaged graphics as soon as x starts, reverting until this can be resolved. Signed-off-by: Arunpravin Paneer Selvam Link: https://patchwork.freedesktop.org/patch/msgid/20220708093047.492662-3-Arunpravin.PaneerSelvam@amd.com Reviewed-by: Christian König Signed-off-by: Christian König --- drivers/gpu/drm/Kconfig | 1 - .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h | 97 +---- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 10 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 349 +++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 89 ----- 5 files changed, 171 insertions(+), 375 deletions(-) delete mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index e88c497fa010..f65656df3619 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -256,7 +256,6 @@ config DRM_AMDGPU select HWMON select BACKLIGHT_CLASS_DEVICE select INTERVAL_TREE - select DRM_BUDDY help Choose this option if you have a recent AMD Radeon graphics card. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index 6546552e596c..acfa207cf970 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -30,15 +30,12 @@ #include #include -#include "amdgpu_vram_mgr.h" - /* state back for walking over vram_mgr and gtt_mgr allocations */ struct amdgpu_res_cursor { uint64_t start; uint64_t size; uint64_t remaining; - void *node; - uint32_t mem_type; + struct drm_mm_node *node; }; /** @@ -55,63 +52,27 @@ static inline void amdgpu_res_first(struct ttm_resource *res, uint64_t start, uint64_t size, struct amdgpu_res_cursor *cur) { - struct drm_buddy_block *block; - struct list_head *head, *next; struct drm_mm_node *node; - if (!res) - goto fallback; + if (!res || res->mem_type == TTM_PL_SYSTEM) { + cur->start = start; + cur->size = size; + cur->remaining = size; + cur->node = NULL; + WARN_ON(res && start + size > res->num_pages << PAGE_SHIFT); + return; + } BUG_ON(start + size > res->num_pages << PAGE_SHIFT); - cur->mem_type = res->mem_type; + node = to_ttm_range_mgr_node(res)->mm_nodes; + while (start >= node->size << PAGE_SHIFT) + start -= node++->size << PAGE_SHIFT; - switch (cur->mem_type) { - case TTM_PL_VRAM: - head = &to_amdgpu_vram_mgr_resource(res)->blocks; - - block = list_first_entry_or_null(head, - struct drm_buddy_block, - link); - if (!block) - goto fallback; - - while (start >= amdgpu_vram_mgr_block_size(block)) { - start -= amdgpu_vram_mgr_block_size(block); - - next = block->link.next; - if (next != head) - block = list_entry(next, struct drm_buddy_block, link); - } - - cur->start = amdgpu_vram_mgr_block_start(block) + start; - cur->size = min(amdgpu_vram_mgr_block_size(block) - start, size); - cur->remaining = size; - cur->node = block; - break; - case TTM_PL_TT: - node = to_ttm_range_mgr_node(res)->mm_nodes; - while (start >= node->size << PAGE_SHIFT) - start -= node++->size << PAGE_SHIFT; - - cur->start = (node->start << PAGE_SHIFT) + start; - cur->size = min((node->size << PAGE_SHIFT) - start, size); - cur->remaining = size; - cur->node = node; - break; - default: - goto fallback; - } - - return; - -fallback: - cur->start = start; - cur->size = size; + cur->start = (node->start << PAGE_SHIFT) + start; + cur->size = min((node->size << PAGE_SHIFT) - start, size); cur->remaining = size; - cur->node = NULL; - WARN_ON(res && start + size > res->num_pages << PAGE_SHIFT); - return; + cur->node = node; } /** @@ -124,9 +85,7 @@ fallback: */ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) { - struct drm_buddy_block *block; - struct drm_mm_node *node; - struct list_head *next; + struct drm_mm_node *node = cur->node; BUG_ON(size > cur->remaining); @@ -140,27 +99,9 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) return; } - switch (cur->mem_type) { - case TTM_PL_VRAM: - block = cur->node; - - next = block->link.next; - block = list_entry(next, struct drm_buddy_block, link); - - cur->node = block; - cur->start = amdgpu_vram_mgr_block_start(block); - cur->size = min(amdgpu_vram_mgr_block_size(block), cur->remaining); - break; - case TTM_PL_TT: - node = cur->node; - - cur->node = ++node; - cur->start = node->start << PAGE_SHIFT; - cur->size = min(node->size << PAGE_SHIFT, cur->remaining); - break; - default: - return; - } + cur->node = ++node; + cur->start = node->start << PAGE_SHIFT; + cur->size = min(node->size << PAGE_SHIFT, cur->remaining); } #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index 6a70818039dd..9120ae80ef52 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -26,7 +26,6 @@ #include #include -#include "amdgpu_vram_mgr.h" #include "amdgpu.h" #define AMDGPU_PL_GDS (TTM_PL_PRIV + 0) @@ -39,6 +38,15 @@ #define AMDGPU_POISON 0xd0bed0be +struct amdgpu_vram_mgr { + struct ttm_resource_manager manager; + struct drm_mm mm; + spinlock_t lock; + struct list_head reservations_pending; + struct list_head reserved_pages; + atomic64_t vis_usage; +}; + struct amdgpu_gtt_mgr { struct ttm_resource_manager manager; struct drm_mm mm; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 49e4092f447f..0a7611648573 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -32,10 +32,8 @@ #include "atom.h" struct amdgpu_vram_reservation { - u64 start; - u64 size; - struct list_head allocated; - struct list_head blocks; + struct list_head node; + struct drm_mm_node mm_node; }; static inline struct amdgpu_vram_mgr * @@ -188,18 +186,18 @@ const struct attribute_group amdgpu_vram_mgr_attr_group = { }; /** - * amdgpu_vram_mgr_vis_size - Calculate visible block size + * amdgpu_vram_mgr_vis_size - Calculate visible node size * * @adev: amdgpu_device pointer - * @block: DRM BUDDY block structure + * @node: MM node structure * - * Calculate how many bytes of the DRM BUDDY block are inside visible VRAM + * Calculate how many bytes of the MM node are inside visible VRAM */ static u64 amdgpu_vram_mgr_vis_size(struct amdgpu_device *adev, - struct drm_buddy_block *block) + struct drm_mm_node *node) { - u64 start = amdgpu_vram_mgr_block_start(block); - u64 end = start + amdgpu_vram_mgr_block_size(block); + uint64_t start = node->start << PAGE_SHIFT; + uint64_t end = (node->size + node->start) << PAGE_SHIFT; if (start >= adev->gmc.visible_vram_size) return 0; @@ -220,9 +218,9 @@ u64 amdgpu_vram_mgr_bo_visible_size(struct amdgpu_bo *bo) { struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct ttm_resource *res = bo->tbo.resource; - struct amdgpu_vram_mgr_resource *vres = to_amdgpu_vram_mgr_resource(res); - struct drm_buddy_block *block; - u64 usage = 0; + unsigned pages = res->num_pages; + struct drm_mm_node *mm; + u64 usage; if (amdgpu_gmc_vram_full_visible(&adev->gmc)) return amdgpu_bo_size(bo); @@ -230,8 +228,9 @@ u64 amdgpu_vram_mgr_bo_visible_size(struct amdgpu_bo *bo) if (res->start >= adev->gmc.visible_vram_size >> PAGE_SHIFT) return 0; - list_for_each_entry(block, &vres->blocks, link) - usage += amdgpu_vram_mgr_vis_size(adev, block); + mm = &container_of(res, struct ttm_range_mgr_node, base)->mm_nodes[0]; + for (usage = 0; pages; pages -= mm->size, mm++) + usage += amdgpu_vram_mgr_vis_size(adev, mm); return usage; } @@ -241,30 +240,23 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man) { struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); - struct drm_buddy *mm = &mgr->mm; + struct drm_mm *mm = &mgr->mm; struct amdgpu_vram_reservation *rsv, *temp; - struct drm_buddy_block *block; uint64_t vis_usage; - list_for_each_entry_safe(rsv, temp, &mgr->reservations_pending, blocks) { - if (drm_buddy_alloc_blocks(mm, rsv->start, rsv->start + rsv->size, - rsv->size, mm->chunk_size, &rsv->allocated, - DRM_BUDDY_RANGE_ALLOCATION)) - continue; - - block = amdgpu_vram_mgr_first_block(&rsv->allocated); - if (!block) + list_for_each_entry_safe(rsv, temp, &mgr->reservations_pending, node) { + if (drm_mm_reserve_node(mm, &rsv->mm_node)) continue; dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n", - rsv->start, rsv->size); + rsv->mm_node.start, rsv->mm_node.size); - vis_usage = amdgpu_vram_mgr_vis_size(adev, block); + vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node); atomic64_add(vis_usage, &mgr->vis_usage); spin_lock(&man->bdev->lru_lock); - man->usage += rsv->size; + man->usage += rsv->mm_node.size << PAGE_SHIFT; spin_unlock(&man->bdev->lru_lock); - list_move(&rsv->blocks, &mgr->reserved_pages); + list_move(&rsv->node, &mgr->reserved_pages); } } @@ -286,16 +278,14 @@ int amdgpu_vram_mgr_reserve_range(struct amdgpu_vram_mgr *mgr, if (!rsv) return -ENOMEM; - INIT_LIST_HEAD(&rsv->allocated); - INIT_LIST_HEAD(&rsv->blocks); + INIT_LIST_HEAD(&rsv->node); + rsv->mm_node.start = start >> PAGE_SHIFT; + rsv->mm_node.size = size >> PAGE_SHIFT; - rsv->start = start; - rsv->size = size; - - mutex_lock(&mgr->lock); - list_add_tail(&rsv->blocks, &mgr->reservations_pending); + spin_lock(&mgr->lock); + list_add_tail(&rsv->node, &mgr->reservations_pending); amdgpu_vram_mgr_do_reserve(&mgr->manager); - mutex_unlock(&mgr->lock); + spin_unlock(&mgr->lock); return 0; } @@ -317,19 +307,19 @@ int amdgpu_vram_mgr_query_page_status(struct amdgpu_vram_mgr *mgr, struct amdgpu_vram_reservation *rsv; int ret; - mutex_lock(&mgr->lock); + spin_lock(&mgr->lock); - list_for_each_entry(rsv, &mgr->reservations_pending, blocks) { - if (rsv->start <= start && - (start < (rsv->start + rsv->size))) { + list_for_each_entry(rsv, &mgr->reservations_pending, node) { + if ((rsv->mm_node.start <= start) && + (start < (rsv->mm_node.start + rsv->mm_node.size))) { ret = -EBUSY; goto out; } } - list_for_each_entry(rsv, &mgr->reserved_pages, blocks) { - if (rsv->start <= start && - (start < (rsv->start + rsv->size))) { + list_for_each_entry(rsv, &mgr->reserved_pages, node) { + if ((rsv->mm_node.start <= start) && + (start < (rsv->mm_node.start + rsv->mm_node.size))) { ret = 0; goto out; } @@ -337,10 +327,32 @@ int amdgpu_vram_mgr_query_page_status(struct amdgpu_vram_mgr *mgr, ret = -ENOENT; out: - mutex_unlock(&mgr->lock); + spin_unlock(&mgr->lock); return ret; } +/** + * amdgpu_vram_mgr_virt_start - update virtual start address + * + * @mem: ttm_resource to update + * @node: just allocated node + * + * Calculate a virtual BO start address to easily check if everything is CPU + * accessible. + */ +static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem, + struct drm_mm_node *node) +{ + unsigned long start; + + start = node->start + node->size; + if (start > mem->num_pages) + start -= mem->num_pages; + else + start = 0; + mem->start = max(mem->start, start); +} + /** * amdgpu_vram_mgr_new - allocate new ranges * @@ -356,44 +368,46 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, const struct ttm_place *place, struct ttm_resource **res) { - u64 vis_usage = 0, max_bytes, cur_size, min_block_size; + unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages; struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); - struct amdgpu_vram_mgr_resource *vres; - u64 size, remaining_size, lpfn, fpfn; - struct drm_buddy *mm = &mgr->mm; - struct drm_buddy_block *block; - unsigned long pages_per_block; + uint64_t vis_usage = 0, mem_bytes, max_bytes; + struct ttm_range_mgr_node *node; + struct drm_mm *mm = &mgr->mm; + enum drm_mm_insert_mode mode; + unsigned i; int r; - lpfn = place->lpfn << PAGE_SHIFT; + lpfn = place->lpfn; if (!lpfn) - lpfn = man->size; - - fpfn = place->fpfn << PAGE_SHIFT; + lpfn = man->size >> PAGE_SHIFT; max_bytes = adev->gmc.mc_vram_size; if (tbo->type != ttm_bo_type_kernel) max_bytes -= AMDGPU_VM_RESERVED_VRAM; + mem_bytes = tbo->base.size; if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - pages_per_block = ~0ul; + pages_per_node = ~0ul; + num_nodes = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - pages_per_block = HPAGE_PMD_NR; + pages_per_node = HPAGE_PMD_NR; #else /* default to 2MB */ - pages_per_block = 2UL << (20UL - PAGE_SHIFT); + pages_per_node = 2UL << (20UL - PAGE_SHIFT); #endif - pages_per_block = max_t(uint32_t, pages_per_block, - tbo->page_alignment); + pages_per_node = max_t(uint32_t, pages_per_node, + tbo->page_alignment); + num_nodes = DIV_ROUND_UP_ULL(PFN_UP(mem_bytes), pages_per_node); } - vres = kzalloc(sizeof(*vres), GFP_KERNEL); - if (!vres) + node = kvmalloc(struct_size(node, mm_nodes, num_nodes), + GFP_KERNEL | __GFP_ZERO); + if (!node) return -ENOMEM; - ttm_resource_init(tbo, place, &vres->base); + ttm_resource_init(tbo, place, &node->base); /* bail out quickly if there's likely not enough VRAM for this BO */ if (ttm_resource_manager_usage(man) > max_bytes) { @@ -401,130 +415,66 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, goto error_fini; } - INIT_LIST_HEAD(&vres->blocks); - + mode = DRM_MM_INSERT_BEST; if (place->flags & TTM_PL_FLAG_TOPDOWN) - vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; + mode = DRM_MM_INSERT_HIGH; - if (fpfn || lpfn != man->size) - /* Allocate blocks in desired range */ - vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; + pages_left = node->base.num_pages; - remaining_size = vres->base.num_pages << PAGE_SHIFT; + /* Limit maximum size to 2GB due to SG table limitations */ + pages = min(pages_left, 2UL << (30 - PAGE_SHIFT)); - mutex_lock(&mgr->lock); - while (remaining_size) { - if (tbo->page_alignment) - min_block_size = tbo->page_alignment << PAGE_SHIFT; - else - min_block_size = mgr->default_page_size; + i = 0; + spin_lock(&mgr->lock); + while (pages_left) { + uint32_t alignment = tbo->page_alignment; - BUG_ON(min_block_size < mm->chunk_size); + if (pages >= pages_per_node) + alignment = pages_per_node; - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); - - if (size >= pages_per_block << PAGE_SHIFT) - min_block_size = pages_per_block << PAGE_SHIFT; - - cur_size = size; - - if (fpfn + size != place->lpfn << PAGE_SHIFT) { - /* - * Except for actual range allocation, modify the size and - * min_block_size conforming to continuous flag enablement - */ - if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - size = roundup_pow_of_two(size); - min_block_size = size; - /* - * Modify the size value if size is not - * aligned with min_block_size - */ - } else if (!IS_ALIGNED(size, min_block_size)) { - size = round_up(size, min_block_size); + r = drm_mm_insert_node_in_range(mm, &node->mm_nodes[i], pages, + alignment, 0, place->fpfn, + lpfn, mode); + if (unlikely(r)) { + if (pages > pages_per_node) { + if (is_power_of_2(pages)) + pages = pages / 2; + else + pages = rounddown_pow_of_two(pages); + continue; } + goto error_free; } - r = drm_buddy_alloc_blocks(mm, fpfn, - lpfn, - size, - min_block_size, - &vres->blocks, - vres->flags); - if (unlikely(r)) - goto error_free_blocks; + vis_usage += amdgpu_vram_mgr_vis_size(adev, &node->mm_nodes[i]); + amdgpu_vram_mgr_virt_start(&node->base, &node->mm_nodes[i]); + pages_left -= pages; + ++i; - if (size > remaining_size) - remaining_size = 0; - else - remaining_size -= size; + if (pages > pages_left) + pages = pages_left; } - mutex_unlock(&mgr->lock); + spin_unlock(&mgr->lock); - if (cur_size != size) { - struct drm_buddy_block *block; - struct list_head *trim_list; - u64 original_size; - LIST_HEAD(temp); - - trim_list = &vres->blocks; - original_size = vres->base.num_pages << PAGE_SHIFT; - - /* - * If size value is rounded up to min_block_size, trim the last - * block to the required size - */ - if (!list_is_singular(&vres->blocks)) { - block = list_last_entry(&vres->blocks, typeof(*block), link); - list_move_tail(&block->link, &temp); - trim_list = &temp; - /* - * Compute the original_size value by subtracting the - * last block size with (aligned size - original size) - */ - original_size = amdgpu_vram_mgr_block_size(block) - (size - cur_size); - } - - mutex_lock(&mgr->lock); - drm_buddy_block_trim(mm, - original_size, - trim_list); - mutex_unlock(&mgr->lock); - - if (!list_empty(&temp)) - list_splice_tail(trim_list, &vres->blocks); - } - - list_for_each_entry(block, &vres->blocks, link) - vis_usage += amdgpu_vram_mgr_vis_size(adev, block); - - block = amdgpu_vram_mgr_first_block(&vres->blocks); - if (!block) { - r = -EINVAL; - goto error_fini; - } - - vres->base.start = amdgpu_vram_mgr_block_start(block) >> PAGE_SHIFT; - - if (amdgpu_is_vram_mgr_blocks_contiguous(&vres->blocks)) - vres->base.placement |= TTM_PL_FLAG_CONTIGUOUS; + if (i == 1) + node->base.placement |= TTM_PL_FLAG_CONTIGUOUS; if (adev->gmc.xgmi.connected_to_cpu) - vres->base.bus.caching = ttm_cached; + node->base.bus.caching = ttm_cached; else - vres->base.bus.caching = ttm_write_combined; + node->base.bus.caching = ttm_write_combined; atomic64_add(vis_usage, &mgr->vis_usage); - *res = &vres->base; + *res = &node->base; return 0; -error_free_blocks: - drm_buddy_free_list(mm, &vres->blocks); - mutex_unlock(&mgr->lock); +error_free: + while (i--) + drm_mm_remove_node(&node->mm_nodes[i]); + spin_unlock(&mgr->lock); error_fini: - ttm_resource_fini(man, &vres->base); - kfree(vres); + ttm_resource_fini(man, &node->base); + kvfree(node); return r; } @@ -540,26 +490,27 @@ error_fini: static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, struct ttm_resource *res) { - struct amdgpu_vram_mgr_resource *vres = to_amdgpu_vram_mgr_resource(res); + struct ttm_range_mgr_node *node = to_ttm_range_mgr_node(res); struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); - struct drm_buddy *mm = &mgr->mm; - struct drm_buddy_block *block; uint64_t vis_usage = 0; + unsigned i, pages; - mutex_lock(&mgr->lock); - list_for_each_entry(block, &vres->blocks, link) - vis_usage += amdgpu_vram_mgr_vis_size(adev, block); + spin_lock(&mgr->lock); + for (i = 0, pages = res->num_pages; pages; + pages -= node->mm_nodes[i].size, ++i) { + struct drm_mm_node *mm = &node->mm_nodes[i]; + drm_mm_remove_node(mm); + vis_usage += amdgpu_vram_mgr_vis_size(adev, mm); + } amdgpu_vram_mgr_do_reserve(man); - - drm_buddy_free_list(mm, &vres->blocks); - mutex_unlock(&mgr->lock); + spin_unlock(&mgr->lock); atomic64_sub(vis_usage, &mgr->vis_usage); ttm_resource_fini(man, res); - kfree(vres); + kvfree(node); } /** @@ -591,7 +542,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, if (!*sgt) return -ENOMEM; - /* Determine the number of DRM_BUDDY blocks to export */ + /* Determine the number of DRM_MM nodes to export */ amdgpu_res_first(res, offset, length, &cursor); while (cursor.remaining) { num_entries++; @@ -607,10 +558,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg->length = 0; /* - * Walk down DRM_BUDDY blocks to populate scatterlist nodes - * @note: Use iterator api to get first the DRM_BUDDY block + * Walk down DRM_MM nodes to populate scatterlist nodes + * @note: Use iterator api to get first the DRM_MM node * and the number of bytes from it. Access the following - * DRM_BUDDY block(s) if more buffer needs to exported + * DRM_MM node(s) if more buffer needs to exported */ amdgpu_res_first(res, offset, length, &cursor); for_each_sgtable_sg((*sgt), sg, i) { @@ -697,22 +648,13 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man, struct drm_printer *printer) { struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); - struct drm_buddy *mm = &mgr->mm; - struct drm_buddy_block *block; drm_printf(printer, " vis usage:%llu\n", amdgpu_vram_mgr_vis_usage(mgr)); - mutex_lock(&mgr->lock); - drm_printf(printer, "default_page_size: %lluKiB\n", - mgr->default_page_size >> 10); - - drm_buddy_print(mm, printer); - - drm_printf(printer, "reserved:\n"); - list_for_each_entry(block, &mgr->reserved_pages, link) - drm_buddy_block_print(mm, block, printer); - mutex_unlock(&mgr->lock); + spin_lock(&mgr->lock); + drm_mm_print(&mgr->mm, printer); + spin_unlock(&mgr->lock); } static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = { @@ -732,21 +674,16 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev) { struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; struct ttm_resource_manager *man = &mgr->manager; - int err; ttm_resource_manager_init(man, &adev->mman.bdev, adev->gmc.real_vram_size); man->func = &amdgpu_vram_mgr_func; - err = drm_buddy_init(&mgr->mm, man->size, PAGE_SIZE); - if (err) - return err; - - mutex_init(&mgr->lock); + drm_mm_init(&mgr->mm, 0, man->size >> PAGE_SHIFT); + spin_lock_init(&mgr->lock); INIT_LIST_HEAD(&mgr->reservations_pending); INIT_LIST_HEAD(&mgr->reserved_pages); - mgr->default_page_size = PAGE_SIZE; ttm_set_driver_manager(&adev->mman.bdev, TTM_PL_VRAM, &mgr->manager); ttm_resource_manager_set_used(man, true); @@ -774,16 +711,16 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) if (ret) return; - mutex_lock(&mgr->lock); - list_for_each_entry_safe(rsv, temp, &mgr->reservations_pending, blocks) + spin_lock(&mgr->lock); + list_for_each_entry_safe(rsv, temp, &mgr->reservations_pending, node) kfree(rsv); - list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) { - drm_buddy_free_list(&mgr->mm, &rsv->blocks); + list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, node) { + drm_mm_remove_node(&rsv->mm_node); kfree(rsv); } - drm_buddy_fini(&mgr->mm); - mutex_unlock(&mgr->lock); + drm_mm_takedown(&mgr->mm); + spin_unlock(&mgr->lock); ttm_resource_manager_cleanup(man); ttm_set_driver_manager(&adev->mman.bdev, TTM_PL_VRAM, NULL); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h deleted file mode 100644 index 9a2db87186c7..000000000000 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h +++ /dev/null @@ -1,89 +0,0 @@ -/* SPDX-License-Identifier: MIT - * Copyright 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#ifndef __AMDGPU_VRAM_MGR_H__ -#define __AMDGPU_VRAM_MGR_H__ - -#include - -struct amdgpu_vram_mgr { - struct ttm_resource_manager manager; - struct drm_buddy mm; - /* protects access to buffer objects */ - struct mutex lock; - struct list_head reservations_pending; - struct list_head reserved_pages; - atomic64_t vis_usage; - u64 default_page_size; -}; - -struct amdgpu_vram_mgr_resource { - struct ttm_resource base; - struct list_head blocks; - unsigned long flags; -}; - -static inline u64 amdgpu_vram_mgr_block_start(struct drm_buddy_block *block) -{ - return drm_buddy_block_offset(block); -} - -static inline u64 amdgpu_vram_mgr_block_size(struct drm_buddy_block *block) -{ - return PAGE_SIZE << drm_buddy_block_order(block); -} - -static inline struct drm_buddy_block * -amdgpu_vram_mgr_first_block(struct list_head *list) -{ - return list_first_entry_or_null(list, struct drm_buddy_block, link); -} - -static inline bool amdgpu_is_vram_mgr_blocks_contiguous(struct list_head *head) -{ - struct drm_buddy_block *block; - u64 start, size; - - block = amdgpu_vram_mgr_first_block(head); - if (!block) - return false; - - while (head != block->link.next) { - start = amdgpu_vram_mgr_block_start(block); - size = amdgpu_vram_mgr_block_size(block); - - block = list_entry(block->link.next, struct drm_buddy_block, link); - if (start + size != amdgpu_vram_mgr_block_start(block)) - return false; - } - - return true; -} - -static inline struct amdgpu_vram_mgr_resource * -to_amdgpu_vram_mgr_resource(struct ttm_resource *res) -{ - return container_of(res, struct amdgpu_vram_mgr_resource, base); -} - -#endif From 6fb9e1d94789e8ee5a258a23bc588693f743fd6c Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Fri, 1 Jul 2022 16:08:54 +0800 Subject: [PATCH 044/248] usb: typec: add missing uevent when partner support PD System like Android allow user control power role from UI, it is possible to implement application base on typec uevent to refresh UI, but found there is chance that UI show different state from typec attribute file. In typec_set_pwr_opmode(), when partner support PD, there is no uevent send to user space which cause the problem. Fix it by sending uevent notification when change power mode to PD. Fixes: bdecb33af34f ("usb: typec: API for controlling USB Type-C Multiplexers") Cc: stable@vger.kernel.org Signed-off-by: Linyu Yuan Link: https://lore.kernel.org/r/1656662934-10226-1-git-send-email-quic_linyyuan@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/class.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index ee0e520707dd..c4724750c81a 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -1718,6 +1718,7 @@ void typec_set_pwr_opmode(struct typec_port *port, partner->usb_pd = 1; sysfs_notify(&partner_dev->kobj, NULL, "supports_usb_power_delivery"); + kobject_uevent(&partner_dev->kobj, KOBJ_CHANGE); } put_device(partner_dev); } From 3d0dc539029b09fbd125444c16b11a8ed10b9d0f Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Thu, 7 Jul 2022 13:56:12 +0200 Subject: [PATCH 045/248] usb: gadget: uvc: fix changing interface name via configfs When setting the function name, it is always truncated by one char since snprintf is always including the null-termination in the len parameter. We use strscpy and fix the size setting to use len + 1 instead. Fixes: 324e4f85070f ("usb: gadget: uvc: allow changing interface name via configfs") Signed-off-by: Michael Grzeschik Link: https://lore.kernel.org/r/20220707115612.2760569-1-m.grzeschik@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/uvc_configfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/uvc_configfs.c b/drivers/usb/gadget/function/uvc_configfs.c index e5a6b6e36b3d..4303a3283ba0 100644 --- a/drivers/usb/gadget/function/uvc_configfs.c +++ b/drivers/usb/gadget/function/uvc_configfs.c @@ -2371,6 +2371,7 @@ static ssize_t f_uvc_opts_string_##cname##_store(struct config_item *item,\ const char *page, size_t len) \ { \ struct f_uvc_opts *opts = to_f_uvc_opts(item); \ + int size = min(sizeof(opts->aname), len + 1); \ int ret = 0; \ \ mutex_lock(&opts->lock); \ @@ -2379,8 +2380,9 @@ static ssize_t f_uvc_opts_string_##cname##_store(struct config_item *item,\ goto end; \ } \ \ - ret = snprintf(opts->aname, min(sizeof(opts->aname), len), \ - "%s", page); \ + ret = strscpy(opts->aname, page, size); \ + if (ret == -E2BIG) \ + ret = size - 1; \ \ end: \ mutex_unlock(&opts->lock); \ From bb160ee61c04fe96f3cc0088ef1907214861dccc Mon Sep 17 00:00:00 2001 From: Darren Stevens Date: Sat, 2 Jul 2022 22:03:55 +0100 Subject: [PATCH 046/248] drivers/usb/host/ehci-fsl: Fix interrupt setup in host mode. In patch a1a2b7125e10 (Drop static setup of IRQ resource from DT core) we stopped platform_get_resource() from returning the IRQ, as all drivers were supposed to have switched to platform_get_irq() Unfortunately the Freescale EHCI driver in host mode got missed. Fix it. Fixes: a1a2b7125e10 ("of/platform: Drop static setup of IRQ resource from DT core") Reported-by: Christian Zigotzky Suggested-by: Rob Herring Tested-by: Christian Zigotzky Acked-by: Rob Herring Acked-by: Alan Stern Signed-off-by: Darren Stevens Link: https://lore.kernel.org/r/20220702220355.63b36fb8@Cyrus.lan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-fsl.c | 11 +++-------- drivers/usb/host/fsl-mph-dr-of.c | 3 +++ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/usb/host/ehci-fsl.c b/drivers/usb/host/ehci-fsl.c index 385be30baad3..896c0d107f72 100644 --- a/drivers/usb/host/ehci-fsl.c +++ b/drivers/usb/host/ehci-fsl.c @@ -76,14 +76,9 @@ static int fsl_ehci_drv_probe(struct platform_device *pdev) return -ENODEV; } - res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (!res) { - dev_err(&pdev->dev, - "Found HC with no IRQ. Check %s setup!\n", - dev_name(&pdev->dev)); - return -ENODEV; - } - irq = res->start; + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; hcd = __usb_create_hcd(&fsl_ehci_hc_driver, pdev->dev.parent, &pdev->dev, dev_name(&pdev->dev), NULL); diff --git a/drivers/usb/host/fsl-mph-dr-of.c b/drivers/usb/host/fsl-mph-dr-of.c index 44a7e58a26e3..e5df17522892 100644 --- a/drivers/usb/host/fsl-mph-dr-of.c +++ b/drivers/usb/host/fsl-mph-dr-of.c @@ -112,6 +112,9 @@ static struct platform_device *fsl_usb2_device_register( goto error; } + pdev->dev.of_node = ofdev->dev.of_node; + pdev->dev.of_node_reused = true; + retval = platform_device_add(pdev); if (retval) goto error; From 716b10580283fda66f2b88140e3964f8a7f9da89 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 7 Jul 2022 10:25:57 +0200 Subject: [PATCH 047/248] tty: extract tty_flip_buffer_commit() from tty_flip_buffer_push() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We will need this new helper in the next patch. Cc: Hillf Danton Cc: 一只狗 Cc: Dan Carpenter Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20220707082558.9250-1-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_buffer.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index bfa431a8e690..303a26c1b821 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -532,6 +532,15 @@ static void flush_to_ldisc(struct work_struct *work) } +static inline void tty_flip_buffer_commit(struct tty_buffer *tail) +{ + /* + * Paired w/ acquire in flush_to_ldisc(); ensures flush_to_ldisc() sees + * buffer data. + */ + smp_store_release(&tail->commit, tail->used); +} + /** * tty_flip_buffer_push - push terminal buffers * @port: tty port to push @@ -546,11 +555,7 @@ void tty_flip_buffer_push(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; - /* - * Paired w/ acquire in flush_to_ldisc(); ensures flush_to_ldisc() sees - * buffer data. - */ - smp_store_release(&buf->tail->commit, buf->tail->used); + tty_flip_buffer_commit(buf->tail); queue_work(system_unbound_wq, &buf->work); } EXPORT_SYMBOL(tty_flip_buffer_push); From a501ab75e7624d133a5a3c7ec010687c8b961d23 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 7 Jul 2022 10:25:58 +0200 Subject: [PATCH 048/248] tty: use new tty_insert_flip_string_and_push_buffer() in pty_write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a race in pty_write(). pty_write() can be called in parallel with e.g. ioctl(TIOCSTI) or ioctl(TCXONC) which also inserts chars to the buffer. Provided, tty_flip_buffer_push() in pty_write() is called outside the lock, it can commit inconsistent tail. This can lead to out of bounds writes and other issues. See the Link below. To fix this, we have to introduce a new helper called tty_insert_flip_string_and_push_buffer(). It does both tty_insert_flip_string() and tty_flip_buffer_commit() under the port lock. It also calls queue_work(), but outside the lock. See 71a174b39f10 (pty: do tty_flip_buffer_push without port->lock in pty_write) for the reasons. Keep the helper internal-only (in drivers' tty.h). It is not intended to be used widely. Link: https://seclists.org/oss-sec/2022/q2/155 Fixes: 71a174b39f10 (pty: do tty_flip_buffer_push without port->lock in pty_write) Cc: 一只狗 Cc: Dan Carpenter Suggested-by: Hillf Danton Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20220707082558.9250-2-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- drivers/tty/pty.c | 14 ++------------ drivers/tty/tty.h | 3 +++ drivers/tty/tty_buffer.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 74bfabe5b453..752dab3356d7 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -111,21 +111,11 @@ static void pty_unthrottle(struct tty_struct *tty) static int pty_write(struct tty_struct *tty, const unsigned char *buf, int c) { struct tty_struct *to = tty->link; - unsigned long flags; - if (tty->flow.stopped) + if (tty->flow.stopped || !c) return 0; - if (c > 0) { - spin_lock_irqsave(&to->port->lock, flags); - /* Stuff the data into the input queue of the other end */ - c = tty_insert_flip_string(to->port, buf, c); - spin_unlock_irqrestore(&to->port->lock, flags); - /* And shovel */ - if (c) - tty_flip_buffer_push(to->port); - } - return c; + return tty_insert_flip_string_and_push_buffer(to->port, buf, c); } /** diff --git a/drivers/tty/tty.h b/drivers/tty/tty.h index b710c5ef89ab..f310a8274df1 100644 --- a/drivers/tty/tty.h +++ b/drivers/tty/tty.h @@ -111,4 +111,7 @@ static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch) ssize_t redirected_tty_write(struct kiocb *, struct iov_iter *); +int tty_insert_flip_string_and_push_buffer(struct tty_port *port, + const unsigned char *chars, size_t cnt); + #endif diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index 303a26c1b821..595d8b49c745 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -560,6 +560,37 @@ void tty_flip_buffer_push(struct tty_port *port) } EXPORT_SYMBOL(tty_flip_buffer_push); +/** + * tty_insert_flip_string_and_push_buffer - add characters to the tty buffer and + * push + * @port: tty port + * @chars: characters + * @size: size + * + * The function combines tty_insert_flip_string() and tty_flip_buffer_push() + * with the exception of properly holding the @port->lock. + * + * To be used only internally (by pty currently). + * + * Returns: the number added. + */ +int tty_insert_flip_string_and_push_buffer(struct tty_port *port, + const unsigned char *chars, size_t size) +{ + struct tty_bufhead *buf = &port->buf; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + size = tty_insert_flip_string(port, chars, size); + if (size) + tty_flip_buffer_commit(buf->tail); + spin_unlock_irqrestore(&port->lock, flags); + + queue_work(system_unbound_wq, &buf->work); + + return size; +} + /** * tty_buffer_init - prepare a tty buffer structure * @port: tty port to initialise From 2fdf15b50a46e366740df4cccbe2343269b4ff55 Mon Sep 17 00:00:00 2001 From: Ryan Wanner Date: Thu, 7 Jul 2022 14:58:12 -0700 Subject: [PATCH 049/248] ARM: dts: at91: sama5d2: Fix typo in i2s1 node Fix typo in i2s1 causing errors in dt binding validation. Change assigned-parrents to assigned-clock-parents to match i2s0 node formatting. Fixes: 1ca81883c557 ("ARM: dts: at91: sama5d2: add nodes for I2S controllers") Signed-off-by: Ryan Wanner [claudiu.beznea: use imperative addressing in commit description, remove blank line after fixes tag, fix typo in commit message] Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220707215812.193008-1-Ryan.Wanner@microchip.com --- arch/arm/boot/dts/sama5d2.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi index 89c71d419f82..659a17fc755c 100644 --- a/arch/arm/boot/dts/sama5d2.dtsi +++ b/arch/arm/boot/dts/sama5d2.dtsi @@ -1124,7 +1124,7 @@ clocks = <&pmc PMC_TYPE_PERIPHERAL 55>, <&pmc PMC_TYPE_GCK 55>; clock-names = "pclk", "gclk"; assigned-clocks = <&pmc PMC_TYPE_CORE PMC_I2S1_MUX>; - assigned-parrents = <&pmc PMC_TYPE_GCK 55>; + assigned-clock-parents = <&pmc PMC_TYPE_GCK 55>; status = "disabled"; }; From 884b66976a7279ee889ba885fe364244d50b79e7 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 8 Jul 2022 19:45:29 +0200 Subject: [PATCH 050/248] ARM: dts: sunxi: Fix SPI NOR campatible on Orange Pi Zero The device tree should include generic "jedec,spi-nor" compatible, and a manufacturer-specific one. The macronix part is what is shipped on the boards that come with a flash chip. Fixes: 45857ae95478 ("ARM: dts: orange-pi-zero: add node for SPI NOR") Signed-off-by: Michal Suchanek Acked-by: Jernej Skrabec Signed-off-by: Jernej Skrabec Link: https://lore.kernel.org/r/20220708174529.3360-1-msuchanek@suse.de --- arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts index f19ed981da9d..3706216ffb40 100644 --- a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts +++ b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts @@ -169,7 +169,7 @@ flash@0 { #address-cells = <1>; #size-cells = <1>; - compatible = "mxicy,mx25l1606e", "winbond,w25q128"; + compatible = "mxicy,mx25l1606e", "jedec,spi-nor"; reg = <0>; spi-max-frequency = <40000000>; }; From b4a544e415e9be33b37d9bfa9d9f9f4d13f553d6 Mon Sep 17 00:00:00 2001 From: William Zhang Date: Fri, 8 Jul 2022 11:25:06 -0700 Subject: [PATCH 051/248] arm64: dts: broadcom: bcm4908: Fix timer node for BCM4906 SoC The cpu mask value in interrupt property inherits from bcm4908.dtsi which sets to four cpus. Correct the value to two cpus for dual core BCM4906 SoC. Fixes: c8b404fb05dc ("arm64: dts: broadcom: bcm4908: add BCM4906 Netgear R8000P DTS files") Signed-off-by: William Zhang Signed-off-by: Florian Fainelli --- arch/arm64/boot/dts/broadcom/bcm4908/bcm4906.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/boot/dts/broadcom/bcm4908/bcm4906.dtsi b/arch/arm64/boot/dts/broadcom/bcm4908/bcm4906.dtsi index 66023d553524..d084c33d5ca8 100644 --- a/arch/arm64/boot/dts/broadcom/bcm4908/bcm4906.dtsi +++ b/arch/arm64/boot/dts/broadcom/bcm4908/bcm4906.dtsi @@ -9,6 +9,14 @@ /delete-node/ cpu@3; }; + timer { + compatible = "arm,armv8-timer"; + interrupts = , + , + , + ; + }; + pmu { compatible = "arm,cortex-a53-pmu"; interrupts = , From 8bd582ae9a71d7f14c4e0c735b2eacaf7516d626 Mon Sep 17 00:00:00 2001 From: William Zhang Date: Fri, 8 Jul 2022 11:25:07 -0700 Subject: [PATCH 052/248] arm64: dts: broadcom: bcm4908: Fix cpu node for smp boot Add spin-table enable-method and cpu-release-addr properties for cpu0 node. This is required by all ARMv8 SoC. Otherwise some bootloader like u-boot can not update cpu-release-addr and linux fails to start up secondary cpus. Fixes: 2961f69f151c ("arm64: dts: broadcom: add BCM4908 and Asus GT-AC5300 early DTS files") Signed-off-by: William Zhang Signed-off-by: Florian Fainelli --- arch/arm64/boot/dts/broadcom/bcm4908/bcm4908.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/broadcom/bcm4908/bcm4908.dtsi b/arch/arm64/boot/dts/broadcom/bcm4908/bcm4908.dtsi index a4be040a00c0..967d2cd3c3ce 100644 --- a/arch/arm64/boot/dts/broadcom/bcm4908/bcm4908.dtsi +++ b/arch/arm64/boot/dts/broadcom/bcm4908/bcm4908.dtsi @@ -29,6 +29,8 @@ device_type = "cpu"; compatible = "brcm,brahma-b53"; reg = <0x0>; + enable-method = "spin-table"; + cpu-release-addr = <0x0 0xfff8>; next-level-cache = <&l2>; }; From 12dc6adc49c9c491d830504e1e01b688ef789093 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 8 Jul 2022 13:19:19 -0700 Subject: [PATCH 053/248] Input: wm97xx - make .remove() obviously always return 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit wm97xx_remove() returns zero unconditionally. To prepare changing the prototype for platform remove callbacks to return void, make it explicit that wm97xx_mfd_remove() always returns zero. The prototype for wm97xx_remove cannot be changed, as it's also used as a plain device remove callback. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20220708062718.240013-1-u.kleine-koenig@pengutronix.de Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/wm97xx-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/wm97xx-core.c b/drivers/input/touchscreen/wm97xx-core.c index 1b58611c8084..a466e8c56462 100644 --- a/drivers/input/touchscreen/wm97xx-core.c +++ b/drivers/input/touchscreen/wm97xx-core.c @@ -786,7 +786,9 @@ batt_err: static int wm97xx_mfd_remove(struct platform_device *pdev) { - return wm97xx_remove(&pdev->dev); + wm97xx_remove(&pdev->dev); + + return 0; } static int __maybe_unused wm97xx_suspend(struct device *dev) From 3de93e6ed2df6521e3f68fb45eec0bb4fe1bb218 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 8 Jul 2022 21:42:12 -0700 Subject: [PATCH 054/248] Input: goodix - call acpi_device_fix_up_power() in some cases On ACPI boards, when we cannot get the GPIOs to do a reset ourselves if necessary, call acpi_device_fix_up_power() to force the ACPI _PS0 method to run. On some devices without proper GPIO descriptions this will reset the touchscreen for us and this may be necessary for us to be able to communicate to the touchscreen at all. Specifically on an Aya Neo Next this change will cause the _PS0() ACPI function to call INIT() which does: Method (INIT, 0, Serialized) { TP_I = 0x00A50000 TP_R = 0x00A50000 Sleep (0x0A) TP_I = 0x00E50000 Sleep (One) TP_R = 0x00E50000 Sleep (0x06) TP_I = 0x00A50000 Sleep (0x3C) TP_I = 0x00041800 } On older kernels the ACPI core assumed a power-on was necessary by itself and would run _PS0 before our probe function runs, which can be seen from the GPIO pin ctrl registers in /sys/kernel/debug/gpio which match the above hex values with older kernels. With newer kernels before this change the GPIO pin ctrl registers do not match, indicating INIT() has not run and probing the touchscreen fails. This change makes Linux run _PS0() again fixing the touchscreen not working on the Aya Neo Next. Reported-and-tested-by: Maya Matuszczyk Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220618210233.208027-1-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/goodix.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/input/touchscreen/goodix.c b/drivers/input/touchscreen/goodix.c index 3ad9870db108..aa45a9fee6a0 100644 --- a/drivers/input/touchscreen/goodix.c +++ b/drivers/input/touchscreen/goodix.c @@ -900,6 +900,11 @@ static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts) } else { dev_warn(dev, "Unexpected ACPI resources: gpio_count %d, gpio_int_idx %d\n", ts->gpio_count, ts->gpio_int_idx); + /* + * On some devices _PS0 does a reset for us and + * sometimes this is necessary for things to work. + */ + acpi_device_fix_up_power(ACPI_COMPANION(dev)); return -EINVAL; } From 2a96271fb66c499e4a89d76a89d3d01170c10bef Mon Sep 17 00:00:00 2001 From: Siarhei Vishniakou Date: Fri, 8 Jul 2022 21:59:23 -0700 Subject: [PATCH 055/248] Input: document the units for resolution of size axes Today, the resolution of size axes is not documented. As a result, it's not clear what the canonical interpretation of this value should be. On Android, there is a need to calculate the size of the touch ellipse in physical units (millimeters). After reviewing linux source, it turned out that most of the existing usages are already interpreting this value as "units/mm". This documentation will make it explicit. This will help device implementations with correctly following the linux specs, and will ensure that the devices will work on Android without needing further customized parameters for scaling of major/minor values. Signed-off-by: Siarhei Vishniakou Reviewed-by: Jeff LaBundy Link: https://lore.kernel.org/r/20220520084514.3451193-1-svv@google.com Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index ee3127461ee0..328cf545c029 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -78,10 +78,13 @@ struct input_id { * Note that input core does not clamp reported values to the * [minimum, maximum] limits, such task is left to userspace. * - * The default resolution for main axes (ABS_X, ABS_Y, ABS_Z) - * is reported in units per millimeter (units/mm), resolution - * for rotational axes (ABS_RX, ABS_RY, ABS_RZ) is reported - * in units per radian. + * The default resolution for main axes (ABS_X, ABS_Y, ABS_Z, + * ABS_MT_POSITION_X, ABS_MT_POSITION_Y) is reported in units + * per millimeter (units/mm), resolution for rotational axes + * (ABS_RX, ABS_RY, ABS_RZ) is reported in units per radian. + * The resolution for the size axes (ABS_MT_TOUCH_MAJOR, + * ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MAJOR, ABS_MT_WIDTH_MINOR) + * is reported in units per millimeter (units/mm). * When INPUT_PROP_ACCELEROMETER is set the resolution changes. * The main axes (ABS_X, ABS_Y, ABS_Z) are then reported in * units per g (units/g) and in units per degree per second From 1968f2be5c03073c3f90d49226723eac4d431282 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Thu, 30 Jun 2022 10:33:23 +0530 Subject: [PATCH 056/248] platform/x86/amd/pmc: Add new acpi id for PMC controller New version of PMC controller will have a separate ACPI id, add that to the support list. Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20220630050324.3780654-1-Shyam-sundar.S-k@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd-pmc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c index f11d18beac18..73d6867cc20b 100644 --- a/drivers/platform/x86/amd-pmc.c +++ b/drivers/platform/x86/amd-pmc.c @@ -91,6 +91,7 @@ #define AMD_CPU_ID_PCO AMD_CPU_ID_RV #define AMD_CPU_ID_CZN AMD_CPU_ID_RN #define AMD_CPU_ID_YC 0x14B5 +#define AMD_CPU_ID_CB 0x14D8 #define PMC_MSG_DELAY_MIN_US 50 #define RESPONSE_REGISTER_LOOP_MAX 20000 @@ -318,6 +319,7 @@ static int amd_pmc_idlemask_read(struct amd_pmc_dev *pdev, struct device *dev, val = amd_pmc_reg_read(pdev, AMD_PMC_SCRATCH_REG_CZN); break; case AMD_CPU_ID_YC: + case AMD_CPU_ID_CB: val = amd_pmc_reg_read(pdev, AMD_PMC_SCRATCH_REG_YC); break; default: @@ -491,7 +493,7 @@ static void amd_pmc_dbgfs_register(struct amd_pmc_dev *dev) &amd_pmc_idlemask_fops); /* Enable STB only when the module_param is set */ if (enable_stb) { - if (dev->cpu_id == AMD_CPU_ID_YC) + if (dev->cpu_id == AMD_CPU_ID_YC || dev->cpu_id == AMD_CPU_ID_CB) debugfs_create_file("stb_read", 0644, dev->dbgfs_dir, dev, &amd_pmc_stb_debugfs_fops_v2); else @@ -615,6 +617,7 @@ static int amd_pmc_get_os_hint(struct amd_pmc_dev *dev) return MSG_OS_HINT_PCO; case AMD_CPU_ID_RN: case AMD_CPU_ID_YC: + case AMD_CPU_ID_CB: return MSG_OS_HINT_RN; } return -EINVAL; @@ -735,6 +738,7 @@ static struct acpi_s2idle_dev_ops amd_pmc_s2idle_dev_ops = { #endif static const struct pci_device_id pmc_pci_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_CB) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_YC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_CZN) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_RN) }, @@ -877,7 +881,7 @@ static int amd_pmc_probe(struct platform_device *pdev) mutex_init(&dev->lock); - if (enable_stb && dev->cpu_id == AMD_CPU_ID_YC) { + if (enable_stb && (dev->cpu_id == AMD_CPU_ID_YC || dev->cpu_id == AMD_CPU_ID_CB)) { err = amd_pmc_s2d_init(dev); if (err) return err; @@ -915,6 +919,7 @@ static const struct acpi_device_id amd_pmc_acpi_ids[] = { {"AMDI0005", 0}, {"AMDI0006", 0}, {"AMDI0007", 0}, + {"AMDI0008", 0}, {"AMD0004", 0}, {"AMD0005", 0}, { } From 4ddef52f26cfaf330240c93d7685a00628c66b04 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Thu, 30 Jun 2022 10:33:24 +0530 Subject: [PATCH 057/248] platform/x86/amd/pmc: Add new platform support PMC driver can be supported on a new upcoming platform. Add this information to the support list. Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20220630050324.3780654-2-Shyam-sundar.S-k@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd-pmc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c index 73d6867cc20b..700eb19e8450 100644 --- a/drivers/platform/x86/amd-pmc.c +++ b/drivers/platform/x86/amd-pmc.c @@ -92,6 +92,7 @@ #define AMD_CPU_ID_CZN AMD_CPU_ID_RN #define AMD_CPU_ID_YC 0x14B5 #define AMD_CPU_ID_CB 0x14D8 +#define AMD_CPU_ID_PS 0x14E8 #define PMC_MSG_DELAY_MIN_US 50 #define RESPONSE_REGISTER_LOOP_MAX 20000 @@ -320,6 +321,7 @@ static int amd_pmc_idlemask_read(struct amd_pmc_dev *pdev, struct device *dev, break; case AMD_CPU_ID_YC: case AMD_CPU_ID_CB: + case AMD_CPU_ID_PS: val = amd_pmc_reg_read(pdev, AMD_PMC_SCRATCH_REG_YC); break; default: @@ -493,7 +495,8 @@ static void amd_pmc_dbgfs_register(struct amd_pmc_dev *dev) &amd_pmc_idlemask_fops); /* Enable STB only when the module_param is set */ if (enable_stb) { - if (dev->cpu_id == AMD_CPU_ID_YC || dev->cpu_id == AMD_CPU_ID_CB) + if (dev->cpu_id == AMD_CPU_ID_YC || dev->cpu_id == AMD_CPU_ID_CB || + dev->cpu_id == AMD_CPU_ID_PS) debugfs_create_file("stb_read", 0644, dev->dbgfs_dir, dev, &amd_pmc_stb_debugfs_fops_v2); else @@ -618,6 +621,7 @@ static int amd_pmc_get_os_hint(struct amd_pmc_dev *dev) case AMD_CPU_ID_RN: case AMD_CPU_ID_YC: case AMD_CPU_ID_CB: + case AMD_CPU_ID_PS: return MSG_OS_HINT_RN; } return -EINVAL; @@ -738,6 +742,7 @@ static struct acpi_s2idle_dev_ops amd_pmc_s2idle_dev_ops = { #endif static const struct pci_device_id pmc_pci_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_PS) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_CB) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_YC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_CZN) }, From 5d62261a65698c1ee4e71f00963b269282015b1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A4r=20Eriksson?= Date: Tue, 5 Jul 2022 20:44:07 +0200 Subject: [PATCH 058/248] platform/x86: gigabyte-wmi: add support for B660I AORUS PRO DDR4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the B660I AORUS PRO DDR4. Signed-off-by: Pär Eriksson Link: https://lore.kernel.org/r/20220705184407.14181-1-parherman@gmail.com Signed-off-by: Hans de Goede --- drivers/platform/x86/gigabyte-wmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/gigabyte-wmi.c b/drivers/platform/x86/gigabyte-wmi.c index 497ad2f64a51..5e7e6659a849 100644 --- a/drivers/platform/x86/gigabyte-wmi.c +++ b/drivers/platform/x86/gigabyte-wmi.c @@ -150,6 +150,7 @@ static const struct dmi_system_id gigabyte_wmi_known_working_platforms[] = { DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550M AORUS PRO-P"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550M DS3H"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B660 GAMING X DDR4"), + DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B660I AORUS PRO DDR4"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("Z390 I AORUS PRO WIFI-CF"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("Z490 AORUS ELITE AC"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("X570 AORUS ELITE"), From 4ce8f4c2027db46299b450b28e9e116aaf00a757 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 8 Jul 2022 15:14:11 +0200 Subject: [PATCH 059/248] platform/x86: x86-android-tablets: Fix Lenovo Yoga Tablet 2 830/1050 poweroff again Commit 98f30d0ecf79 ("ACPI: power: Switch to sys-off handler API") switched the ACPI sleep code from directly setting the old global pm_power_off handler to using the new register_sys_off_handler() mechanism with a priority of SYS_OFF_PRIO_FIRMWARE. This is a problem in special cases where the old global pm_power_off handler later gets overwritten, such as the Lenovo Tab2 poweroff bugfix in x86-android-tablets. The old global pm_power_off handler gets run with a priority of SYS_OFF_PRIO_DEFAULT which is lower then SYS_OFF_PRIO_FIRMWARE, causing the troublesome ACPI poweroff (which freezes the system) to run first. Switch the registering of lenovo_yoga_tab2_830_1050_power_off over to register_sys_off_handler() with a priority of SYS_OFF_PRIO_FIRMWARE + 1 so that it will run before acpi_power_off() to fix this. Fixes: 98f30d0ecf79 ("ACPI: power: Switch to sys-off handler API") Cc: Dmitry Osipenko Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220708131412.81078-2-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index f446be72e539..480375977435 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -27,8 +27,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -889,6 +889,7 @@ static const struct pinctrl_map lenovo_yoga_tab2_830_1050_codec_pinctrl_map = "INT33FC:02", "pmu_clk2_grp", "pmu_clk"); static struct pinctrl *lenovo_yoga_tab2_830_1050_codec_pinctrl; +static struct sys_off_handler *lenovo_yoga_tab2_830_1050_sys_off_handler; static int __init lenovo_yoga_tab2_830_1050_init_codec(void) { @@ -933,9 +934,11 @@ err_put_device: * followed by a normal 3 second press to recover. Avoid this by doing an EFI * poweroff instead. */ -static void lenovo_yoga_tab2_830_1050_power_off(void) +static int lenovo_yoga_tab2_830_1050_power_off(struct sys_off_data *data) { efi.reset_system(EFI_RESET_SHUTDOWN, EFI_SUCCESS, 0, NULL); + + return NOTIFY_DONE; } static int __init lenovo_yoga_tab2_830_1050_init(void) @@ -950,13 +953,19 @@ static int __init lenovo_yoga_tab2_830_1050_init(void) if (ret) return ret; - pm_power_off = lenovo_yoga_tab2_830_1050_power_off; + /* SYS_OFF_PRIO_FIRMWARE + 1 so that it runs before acpi_power_off */ + lenovo_yoga_tab2_830_1050_sys_off_handler = + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, SYS_OFF_PRIO_FIRMWARE + 1, + lenovo_yoga_tab2_830_1050_power_off, NULL); + if (IS_ERR(lenovo_yoga_tab2_830_1050_sys_off_handler)) + return PTR_ERR(lenovo_yoga_tab2_830_1050_sys_off_handler); + return 0; } static void lenovo_yoga_tab2_830_1050_exit(void) { - pm_power_off = NULL; /* Just turn poweroff into halt on module unload */ + unregister_sys_off_handler(lenovo_yoga_tab2_830_1050_sys_off_handler); if (lenovo_yoga_tab2_830_1050_codec_pinctrl) { pinctrl_put(lenovo_yoga_tab2_830_1050_codec_pinctrl); From d40908f2621ea7abc6132ec8c5688a2960eeee3c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 8 Jul 2022 15:14:12 +0200 Subject: [PATCH 060/248] efi: Fix efi_power_off() not being run before acpi_power_off() when necessary Commit 98f30d0ecf79 ("ACPI: power: Switch to sys-off handler API") switched the ACPI sleep code from directly setting the old global pm_power_off handler to using the new register_sys_off_handler() mechanism with a priority of SYS_OFF_PRIO_FIRMWARE. This is a problem when the old global pm_power_off handler would later be overwritten, such as done by the late_initcall(efi_shutdown_init): if (efi_poweroff_required()) pm_power_off = efi_power_off; The old global pm_power_off handler gets run with a priority of SYS_OFF_PRIO_DEFAULT which is lower then SYS_OFF_PRIO_FIRMWARE, causing acpi_power_off() to run first, changing the behavior from before the ACPI sleep code switched to the new register_sys_off_handler(). Switch the registering of efi_power_off over to register_sys_off_handler() with a priority of SYS_OFF_PRIO_FIRMWARE + 1 so that it will run before acpi_power_off() as before. Note since the new sys-off-handler code will try all handlers in priority order, there is no more need for the EFI code to store and call the original pm_power_off handler. Fixes: 98f30d0ecf79 ("ACPI: power: Switch to sys-off handler API") Cc: Dmitry Osipenko Signed-off-by: Hans de Goede Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220708131412.81078-3-hdegoede@redhat.com --- drivers/firmware/efi/reboot.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/firmware/efi/reboot.c b/drivers/firmware/efi/reboot.c index 73089a24f04b..ceae84c19d22 100644 --- a/drivers/firmware/efi/reboot.c +++ b/drivers/firmware/efi/reboot.c @@ -6,7 +6,7 @@ #include #include -static void (*orig_pm_power_off)(void); +static struct sys_off_handler *efi_sys_off_handler; int efi_reboot_quirk_mode = -1; @@ -51,15 +51,11 @@ bool __weak efi_poweroff_required(void) return false; } -static void efi_power_off(void) +static int efi_power_off(struct sys_off_data *data) { efi.reset_system(EFI_RESET_SHUTDOWN, EFI_SUCCESS, 0, NULL); - /* - * The above call should not return, if it does fall back to - * the original power off method (typically ACPI poweroff). - */ - if (orig_pm_power_off) - orig_pm_power_off(); + + return NOTIFY_DONE; } static int __init efi_shutdown_init(void) @@ -68,8 +64,13 @@ static int __init efi_shutdown_init(void) return -ENODEV; if (efi_poweroff_required()) { - orig_pm_power_off = pm_power_off; - pm_power_off = efi_power_off; + /* SYS_OFF_PRIO_FIRMWARE + 1 so that it runs before acpi_power_off */ + efi_sys_off_handler = + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_FIRMWARE + 1, + efi_power_off, NULL); + if (IS_ERR(efi_sys_off_handler)) + return PTR_ERR(efi_sys_off_handler); } return 0; From f56e676a7f1ca7de9002526df3d2ee0e47dfd8ce Mon Sep 17 00:00:00 2001 From: Misaka19465 Date: Sun, 10 Jul 2022 19:37:27 +0800 Subject: [PATCH 061/248] platform/x86: asus-wmi: Add key mappings On laptops like ASUS TUF Gaming A15, which have hotkeys to start Armoury Crate or AURA Sync, these hotkeys are unavailable. This patch add mappings for them. Signed-off-by: Misaka19465 Link: https://lore.kernel.org/r/20220710113727.281634-1-misaka19465@olddoctor.net Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/asus-nb-wmi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index 57a07db659cb..478dd300b9c9 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -522,6 +522,7 @@ static const struct key_entry asus_nb_wmi_keymap[] = { { KE_KEY, 0x31, { KEY_VOLUMEDOWN } }, { KE_KEY, 0x32, { KEY_MUTE } }, { KE_KEY, 0x35, { KEY_SCREENLOCK } }, + { KE_KEY, 0x38, { KEY_PROG3 } }, /* Armoury Crate */ { KE_KEY, 0x40, { KEY_PREVIOUSSONG } }, { KE_KEY, 0x41, { KEY_NEXTSONG } }, { KE_KEY, 0x43, { KEY_STOPCD } }, /* Stop/Eject */ @@ -574,6 +575,7 @@ static const struct key_entry asus_nb_wmi_keymap[] = { { KE_KEY, 0xA5, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + TV + HDMI */ { KE_KEY, 0xA6, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT + TV + HDMI */ { KE_KEY, 0xA7, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + CRT + TV + HDMI */ + { KE_KEY, 0xB3, { KEY_PROG4 } }, /* AURA */ { KE_KEY, 0xB5, { KEY_CALC } }, { KE_KEY, 0xC4, { KEY_KBDILLUMUP } }, { KE_KEY, 0xC5, { KEY_KBDILLUMDOWN } }, From c483e7ea10fa889f9da5012753a6766be6e11309 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 10 Jul 2022 16:07:36 +0200 Subject: [PATCH 062/248] platform/x86/intel/ifs: Mark as BROKEN A recent suggested change to the IFS code has shown that the userspace API needs a bit more work, see: https://lore.kernel.org/platform-driver-x86/20220708151938.986530-1-jithu.joseph@intel.com/ Mark it as BROKEN before 5.19 ships, to give ourselves one more kernel-devel cycle to get the userspace API right. Link: https://lore.kernel.org/platform-driver-x86/20220708151938.986530-1-jithu.joseph@intel.com/ Cc: Jithu Joseph Cc: Ashok Raj Cc: Tony Luck Suggested-by: Greg KH Signed-off-by: Hans de Goede Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220710140736.6492-1-hdegoede@redhat.com --- drivers/platform/x86/intel/ifs/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/platform/x86/intel/ifs/Kconfig b/drivers/platform/x86/intel/ifs/Kconfig index 7ce896434b8f..c341a27cc1a3 100644 --- a/drivers/platform/x86/intel/ifs/Kconfig +++ b/drivers/platform/x86/intel/ifs/Kconfig @@ -1,6 +1,9 @@ config INTEL_IFS tristate "Intel In Field Scan" depends on X86 && CPU_SUP_INTEL && 64BIT && SMP + # Discussion on the list has shown that the sysfs API needs a bit + # more work, mark this as broken for now + depends on BROKEN select INTEL_IFS_DEVICE help Enable support for the In Field Scan capability in select From b0d55983b2b885f6f96d6d6898d27a60bd9dc9a2 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 10 Jul 2022 19:36:58 +0200 Subject: [PATCH 063/248] platform/x86: intel_atomisp2_led: Also turn off the always-on camera LED on the Asus T100TAF Like the Asus T100TA the Asus T100TAF has a camera LED which is always on by default and both also use the same GPIO for the LED. Relax the DMI match for the Asus T100TA so that it also matches the T100TAF. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220710173658.221528-1-hdegoede@redhat.com --- drivers/platform/x86/intel/atomisp2/led.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/atomisp2/led.c b/drivers/platform/x86/intel/atomisp2/led.c index 5935dfca166f..10077a61d8c5 100644 --- a/drivers/platform/x86/intel/atomisp2/led.c +++ b/drivers/platform/x86/intel/atomisp2/led.c @@ -50,7 +50,8 @@ static const struct dmi_system_id atomisp2_led_systems[] __initconst = { { .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), - DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "T100TA"), + /* Non exact match to also match T100TAF */ + DMI_MATCH(DMI_PRODUCT_NAME, "T100TA"), }, .driver_data = &asus_t100ta_lookup, }, From 88573389aaa34640b9ecde15622c68e8737d8f8c Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 11 Jul 2022 09:29:51 +0530 Subject: [PATCH 064/248] riscv: Fix missing PAGE_PFN_MASK There are a bunch of functions that use the PFN from a page table entry that end up with the svpbmt upper-bits because they are missing the newly introduced PAGE_PFN_MASK which leads to wrong addresses conversions and then crash: fix this by adding this mask. Fixes: 100631b48ded ("riscv: Fix accessing pfn bits in PTEs for non-32bit variants") Signed-off-by: Alexandre Ghiti Signed-off-by: Anup Patel --- arch/riscv/include/asm/pgtable-64.h | 12 ++++++------ arch/riscv/include/asm/pgtable.h | 6 +++--- arch/riscv/kvm/mmu.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 5c2aba5efbd0..dc42375c2357 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -175,7 +175,7 @@ static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) static inline unsigned long _pud_pfn(pud_t pud) { - return pud_val(pud) >> _PAGE_PFN_SHIFT; + return __page_val_to_pfn(pud_val(pud)); } static inline pmd_t *pud_pgtable(pud_t pud) @@ -278,13 +278,13 @@ static inline p4d_t pfn_p4d(unsigned long pfn, pgprot_t prot) static inline unsigned long _p4d_pfn(p4d_t p4d) { - return p4d_val(p4d) >> _PAGE_PFN_SHIFT; + return __page_val_to_pfn(p4d_val(p4d)); } static inline pud_t *p4d_pgtable(p4d_t p4d) { if (pgtable_l4_enabled) - return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT); + return (pud_t *)pfn_to_virt(__page_val_to_pfn(p4d_val(p4d))); return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) }); } @@ -292,7 +292,7 @@ static inline pud_t *p4d_pgtable(p4d_t p4d) static inline struct page *p4d_page(p4d_t p4d) { - return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT); + return pfn_to_page(__page_val_to_pfn(p4d_val(p4d))); } #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) @@ -347,7 +347,7 @@ static inline void pgd_clear(pgd_t *pgd) static inline p4d_t *pgd_pgtable(pgd_t pgd) { if (pgtable_l5_enabled) - return (p4d_t *)pfn_to_virt(pgd_val(pgd) >> _PAGE_PFN_SHIFT); + return (p4d_t *)pfn_to_virt(__page_val_to_pfn(pgd_val(pgd))); return (p4d_t *)p4d_pgtable((p4d_t) { pgd_val(pgd) }); } @@ -355,7 +355,7 @@ static inline p4d_t *pgd_pgtable(pgd_t pgd) static inline struct page *pgd_page(pgd_t pgd) { - return pfn_to_page(pgd_val(pgd) >> _PAGE_PFN_SHIFT); + return pfn_to_page(__page_val_to_pfn(pgd_val(pgd))); } #define pgd_page(pgd) pgd_page(pgd) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 1d1be9d9419c..5dbd6610729b 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -261,7 +261,7 @@ static inline pgd_t pfn_pgd(unsigned long pfn, pgprot_t prot) static inline unsigned long _pgd_pfn(pgd_t pgd) { - return pgd_val(pgd) >> _PAGE_PFN_SHIFT; + return __page_val_to_pfn(pgd_val(pgd)); } static inline struct page *pmd_page(pmd_t pmd) @@ -590,14 +590,14 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd) return __pmd(pmd_val(pmd) & ~(_PAGE_PRESENT|_PAGE_PROT_NONE)); } -#define __pmd_to_phys(pmd) (pmd_val(pmd) >> _PAGE_PFN_SHIFT << PAGE_SHIFT) +#define __pmd_to_phys(pmd) (__page_val_to_pfn(pmd_val(pmd)) << PAGE_SHIFT) static inline unsigned long pmd_pfn(pmd_t pmd) { return ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT); } -#define __pud_to_phys(pud) (pud_val(pud) >> _PAGE_PFN_SHIFT << PAGE_SHIFT) +#define __pud_to_phys(pud) (__page_val_to_pfn(pud_val(pud)) << PAGE_SHIFT) static inline unsigned long pud_pfn(pud_t pud) { diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 1c00695ebee7..9826073fbc67 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -54,7 +54,7 @@ static inline unsigned long gstage_pte_index(gpa_t addr, u32 level) static inline unsigned long gstage_pte_page_vaddr(pte_t pte) { - return (unsigned long)pfn_to_virt(pte_val(pte) >> _PAGE_PFN_SHIFT); + return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte))); } static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level) From be82abe6a76ba8e76f25312566182b0f13c4fbf9 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 11 Jul 2022 09:36:32 +0530 Subject: [PATCH 065/248] RISC-V: KVM: Fix SRCU deadlock caused by kvm_riscv_check_vcpu_requests() The kvm_riscv_check_vcpu_requests() is called with SRCU read lock held and for KVM_REQ_SLEEP request it will block the VCPU without releasing SRCU read lock. This causes KVM ioctls (such as KVM_IOEVENTFD) from other VCPUs of the same Guest/VM to hang/deadlock if there is any synchronize_srcu() or synchronize_srcu_expedited() in the path. To fix the above in kvm_riscv_check_vcpu_requests(), we should do SRCU read unlock before blocking the VCPU and do SRCU read lock after VCPU wakeup. Fixes: cce69aff689e ("RISC-V: KVM: Implement VCPU interrupts and requests handling") Reported-by: Bin Meng Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Heinrich Schuchardt Tested-by: Bin Meng Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 7f4ad5e4373a..f3455dc013fa 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -781,9 +781,11 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) { + kvm_vcpu_srcu_read_unlock(vcpu); rcuwait_wait_event(wait, (!vcpu->arch.power_off) && (!vcpu->arch.pause), TASK_INTERRUPTIBLE); + kvm_vcpu_srcu_read_lock(vcpu); if (vcpu->arch.power_off || vcpu->arch.pause) { /* From e87197fbd137c888fd6c871c72fe7e89445dd015 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 8 Jul 2022 11:41:06 +0300 Subject: [PATCH 066/248] drm/i915/gvt: IS_ERR() vs NULL bug in intel_gvt_update_reg_whitelist() The shmem_pin_map() function returns NULL, it doesn't return error pointers. Fixes: 97ea656521c8 ("drm/i915/gvt: Parse default state to update reg whitelist") Reviewed-by: Andrzej Hajda Signed-off-by: Dan Carpenter Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/Ysftoia2BPUyqVcD@kili Acked-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/cmd_parser.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c index c4118b808268..11971ee929f8 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.c +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c @@ -3115,9 +3115,9 @@ void intel_gvt_update_reg_whitelist(struct intel_vgpu *vgpu) continue; vaddr = shmem_pin_map(engine->default_state); - if (IS_ERR(vaddr)) { - gvt_err("failed to map %s->default state, err:%zd\n", - engine->name, PTR_ERR(vaddr)); + if (!vaddr) { + gvt_err("failed to map %s->default state\n", + engine->name); return; } From 5e8afb8792f3b6ae7ccf700f8c19225382636401 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Tue, 5 Jul 2022 18:08:36 -0500 Subject: [PATCH 067/248] RDMA/irdma: Do not advertise 1GB page size for x722 x722 does not support 1GB page size but the irdma driver incorrectly advertises 1GB page size support for x722 device to ib_core to compute the best page size to use on this MR. This could lead to incorrect start offsets computed by hardware on the MR. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/i40iw_hw.c | 1 + drivers/infiniband/hw/irdma/icrdma_hw.c | 1 + drivers/infiniband/hw/irdma/irdma.h | 1 + drivers/infiniband/hw/irdma/verbs.c | 4 ++-- 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/irdma/i40iw_hw.c b/drivers/infiniband/hw/irdma/i40iw_hw.c index e46fc110004d..50299f58b6b3 100644 --- a/drivers/infiniband/hw/irdma/i40iw_hw.c +++ b/drivers/infiniband/hw/irdma/i40iw_hw.c @@ -201,6 +201,7 @@ void i40iw_init_hw(struct irdma_sc_dev *dev) dev->hw_attrs.uk_attrs.max_hw_read_sges = I40IW_MAX_SGE_RD; dev->hw_attrs.max_hw_device_pages = I40IW_MAX_PUSH_PAGE_COUNT; dev->hw_attrs.uk_attrs.max_hw_inline = I40IW_MAX_INLINE_DATA_SIZE; + dev->hw_attrs.page_size_cap = SZ_4K | SZ_2M; dev->hw_attrs.max_hw_ird = I40IW_MAX_IRD_SIZE; dev->hw_attrs.max_hw_ord = I40IW_MAX_ORD_SIZE; dev->hw_attrs.max_hw_wqes = I40IW_MAX_WQ_ENTRIES; diff --git a/drivers/infiniband/hw/irdma/icrdma_hw.c b/drivers/infiniband/hw/irdma/icrdma_hw.c index cf53b17510cd..5986fd906308 100644 --- a/drivers/infiniband/hw/irdma/icrdma_hw.c +++ b/drivers/infiniband/hw/irdma/icrdma_hw.c @@ -139,6 +139,7 @@ void icrdma_init_hw(struct irdma_sc_dev *dev) dev->cqp_db = dev->hw_regs[IRDMA_CQPDB]; dev->cq_ack_db = dev->hw_regs[IRDMA_CQACK]; dev->irq_ops = &icrdma_irq_ops; + dev->hw_attrs.page_size_cap = SZ_4K | SZ_2M | SZ_1G; dev->hw_attrs.max_hw_ird = ICRDMA_MAX_IRD_SIZE; dev->hw_attrs.max_hw_ord = ICRDMA_MAX_ORD_SIZE; dev->hw_attrs.max_stat_inst = ICRDMA_MAX_STATS_COUNT; diff --git a/drivers/infiniband/hw/irdma/irdma.h b/drivers/infiniband/hw/irdma/irdma.h index 46c12334c735..4789e85d717b 100644 --- a/drivers/infiniband/hw/irdma/irdma.h +++ b/drivers/infiniband/hw/irdma/irdma.h @@ -127,6 +127,7 @@ struct irdma_hw_attrs { u64 max_hw_outbound_msg_size; u64 max_hw_inbound_msg_size; u64 max_mr_size; + u64 page_size_cap; u32 min_hw_qp_id; u32 min_hw_aeq_size; u32 max_hw_aeq_size; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index c4412ece5a6d..96135a228f26 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -32,7 +32,7 @@ static int irdma_query_device(struct ib_device *ibdev, props->vendor_part_id = pcidev->device; props->hw_ver = rf->pcidev->revision; - props->page_size_cap = SZ_4K | SZ_2M | SZ_1G; + props->page_size_cap = hw_attrs->page_size_cap; props->max_mr_size = hw_attrs->max_mr_size; props->max_qp = rf->max_qp - rf->used_qps; props->max_qp_wr = hw_attrs->max_qp_wr; @@ -2781,7 +2781,7 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, if (req.reg_type == IRDMA_MEMREG_TYPE_MEM) { iwmr->page_size = ib_umem_find_best_pgsz(region, - SZ_4K | SZ_2M | SZ_1G, + iwdev->rf->sc_dev.hw_attrs.page_size_cap, virt); if (unlikely(!iwmr->page_size)) { kfree(iwmr); From cc0315564d6eec91c716d314b743321be24c70b3 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Tue, 5 Jul 2022 18:08:37 -0500 Subject: [PATCH 068/248] RDMA/irdma: Fix sleep from invalid context BUG Taking the qos_mutex to process RoCEv2 QP's on netdev events causes a kernel splat. Fix this by removing the handling for RoCEv2 in irdma_cm_teardown_connections that uses the mutex. This handling is only needed for iWARP to avoid having connections established while the link is down or having connections remain functional after the IP address is removed. BUG: sleeping function called from invalid context at kernel/locking/mutex. Call Trace: kernel: dump_stack+0x66/0x90 kernel: ___might_sleep.cold.92+0x8d/0x9a kernel: mutex_lock+0x1c/0x40 kernel: irdma_cm_teardown_connections+0x28e/0x4d0 [irdma] kernel: ? check_preempt_curr+0x7a/0x90 kernel: ? select_idle_sibling+0x22/0x3c0 kernel: ? select_task_rq_fair+0x94c/0xc90 kernel: ? irdma_exec_cqp_cmd+0xc27/0x17c0 [irdma] kernel: ? __wake_up_common+0x7a/0x190 kernel: irdma_if_notify+0x3cc/0x450 [irdma] kernel: ? sched_clock_cpu+0xc/0xb0 kernel: irdma_inet6addr_event+0xc6/0x150 [irdma] Fixes: 146b9756f14c ("RDMA/irdma: Add connection manager") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/cm.c | 50 -------------------------------- 1 file changed, 50 deletions(-) diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 638bf4a1ed94..646fa8677490 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -4231,10 +4231,6 @@ void irdma_cm_teardown_connections(struct irdma_device *iwdev, u32 *ipaddr, struct irdma_cm_node *cm_node; struct list_head teardown_list; struct ib_qp_attr attr; - struct irdma_sc_vsi *vsi = &iwdev->vsi; - struct irdma_sc_qp *sc_qp; - struct irdma_qp *qp; - int i; INIT_LIST_HEAD(&teardown_list); @@ -4251,52 +4247,6 @@ void irdma_cm_teardown_connections(struct irdma_device *iwdev, u32 *ipaddr, irdma_cm_disconn(cm_node->iwqp); irdma_rem_ref_cm_node(cm_node); } - if (!iwdev->roce_mode) - return; - - INIT_LIST_HEAD(&teardown_list); - for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) { - mutex_lock(&vsi->qos[i].qos_mutex); - list_for_each_safe (list_node, list_core_temp, - &vsi->qos[i].qplist) { - u32 qp_ip[4]; - - sc_qp = container_of(list_node, struct irdma_sc_qp, - list); - if (sc_qp->qp_uk.qp_type != IRDMA_QP_TYPE_ROCE_RC) - continue; - - qp = sc_qp->qp_uk.back_qp; - if (!disconnect_all) { - if (nfo->ipv4) - qp_ip[0] = qp->udp_info.local_ipaddr[3]; - else - memcpy(qp_ip, - &qp->udp_info.local_ipaddr[0], - sizeof(qp_ip)); - } - - if (disconnect_all || - (nfo->vlan_id == (qp->udp_info.vlan_tag & VLAN_VID_MASK) && - !memcmp(qp_ip, ipaddr, nfo->ipv4 ? 4 : 16))) { - spin_lock(&iwdev->rf->qptable_lock); - if (iwdev->rf->qp_table[sc_qp->qp_uk.qp_id]) { - irdma_qp_add_ref(&qp->ibqp); - list_add(&qp->teardown_entry, - &teardown_list); - } - spin_unlock(&iwdev->rf->qptable_lock); - } - } - mutex_unlock(&vsi->qos[i].qos_mutex); - } - - list_for_each_safe (list_node, list_core_temp, &teardown_list) { - qp = container_of(list_node, struct irdma_qp, teardown_entry); - attr.qp_state = IB_QPS_ERR; - irdma_modify_qp_roce(&qp->ibqp, &attr, IB_QP_STATE, NULL); - irdma_qp_rem_ref(&qp->ibqp); - } } /** From 166d3863231667c4f64dee72b77d1102cdfad11f Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sun, 10 Jul 2022 19:05:22 -0400 Subject: [PATCH 069/248] xen/gntdev: Ignore failure to unmap INVALID_GRANT_HANDLE The error paths of gntdev_mmap() can call unmap_grant_pages() even though not all of the pages have been successfully mapped. This will trigger the WARN_ON()s in __unmap_grant_pages_done(). The number of warnings can be very large; I have observed thousands of lines of warnings in the systemd journal. Avoid this problem by only warning on unmapping failure if the handle being unmapped is not INVALID_GRANT_HANDLE. The handle field of any page that was not successfully mapped will be INVALID_GRANT_HANDLE, so this catches all cases where unmapping can legitimately fail. Fixes: dbe97cff7dd9 ("xen/gntdev: Avoid blocking in unmap_grant_pages()") Cc: stable@vger.kernel.org Suggested-by: Juergen Gross Signed-off-by: Demi Marie Obenour Reviewed-by: Oleksandr Tyshchenko Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220710230522.1563-1-demi@invisiblethingslab.com Signed-off-by: Juergen Gross --- drivers/xen/gntdev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 4b56c39f766d..84b143eef395 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -396,13 +396,15 @@ static void __unmap_grant_pages_done(int result, unsigned int offset = data->unmap_ops - map->unmap_ops; for (i = 0; i < data->count; i++) { - WARN_ON(map->unmap_ops[offset+i].status); + WARN_ON(map->unmap_ops[offset + i].status != GNTST_okay && + map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE); pr_debug("unmap handle=%d st=%d\n", map->unmap_ops[offset+i].handle, map->unmap_ops[offset+i].status); map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; if (use_ptemod) { - WARN_ON(map->kunmap_ops[offset+i].status); + WARN_ON(map->kunmap_ops[offset + i].status != GNTST_okay && + map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE); pr_debug("kunmap handle=%u st=%d\n", map->kunmap_ops[offset+i].handle, map->kunmap_ops[offset+i].status); From e79b9473e9b59e83e23db9b7411c6080b5a6826d Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Fri, 8 Jul 2022 17:37:04 -0700 Subject: [PATCH 070/248] net: ipv4: fix clang -Wformat warnings When building with Clang we encounter these warnings: | net/ipv4/ah4.c:513:4: error: format specifies type 'unsigned short' but | the argument has type 'int' [-Werror,-Wformat] | aalg_desc->uinfo.auth.icv_fullbits / 8); - | net/ipv4/esp4.c:1114:5: error: format specifies type 'unsigned short' | but the argument has type 'int' [-Werror,-Wformat] | aalg_desc->uinfo.auth.icv_fullbits / 8); `aalg_desc->uinfo.auth.icv_fullbits` is a u16 but due to default argument promotion becomes an int. Variadic functions (printf-like) undergo default argument promotion. Documentation/core-api/printk-formats.rst specifically recommends using the promoted-to-type's format flag. As per C11 6.3.1.1: (https://www.open-std.org/jtc1/sc22/wg14/www/docs/n1548.pdf) `If an int can represent all values of the original type ..., the value is converted to an int; otherwise, it is converted to an unsigned int. These are called the integer promotions.` Thus it makes sense to change %hu to %d not only to follow this standard but to suppress the warning as well. Link: https://github.com/ClangBuiltLinux/linux/issues/378 Signed-off-by: Justin Stitt Suggested-by: Joe Perches Suggested-by: Nathan Chancellor Suggested-by: Nick Desaulniers Signed-off-by: Steffen Klassert --- net/ipv4/ah4.c | 2 +- net/ipv4/esp4.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 6eea1e9e998d..f8ad04470d3a 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -507,7 +507,7 @@ static int ah_init_state(struct xfrm_state *x) if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_ahash_digestsize(ahash)) { - pr_info("%s: %s digestsize %u != %hu\n", + pr_info("%s: %s digestsize %u != %u\n", __func__, x->aalg->alg_name, crypto_ahash_digestsize(ahash), aalg_desc->uinfo.auth.icv_fullbits / 8); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b21238df3301..b694f352ce7a 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -1108,7 +1108,7 @@ static int esp_init_authenc(struct xfrm_state *x) err = -EINVAL; if (aalg_desc->uinfo.auth.icv_fullbits / 8 != crypto_aead_authsize(aead)) { - pr_info("ESP: %s digestsize %u != %hu\n", + pr_info("ESP: %s digestsize %u != %u\n", x->aalg->alg_name, crypto_aead_authsize(aead), aalg_desc->uinfo.auth.icv_fullbits / 8); From 7329b071729645e243b6207e76bca2f4951c991b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 12 Jul 2022 09:40:55 +0200 Subject: [PATCH 071/248] gpio: sim: fix the chip_name configfs item The chip_name configs attribute always displays the device name of the first GPIO bank because the logic of the relevant function is simply wrong. Fix it by correctly comparing the bank's swnode against the GPIO device's children. Fixes: cb8c474e79be ("gpio: sim: new testing module") Cc: stable@vger.kernel.org Reported-by: Kent Gibson Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Reviewed-by: Kent Gibson Tested-by: Kent Gibson --- drivers/gpio/gpio-sim.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index 98109839102f..1020c2feb249 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -991,28 +991,22 @@ static struct configfs_attribute *gpio_sim_device_config_attrs[] = { }; struct gpio_sim_chip_name_ctx { - struct gpio_sim_device *dev; + struct fwnode_handle *swnode; char *page; }; static int gpio_sim_emit_chip_name(struct device *dev, void *data) { struct gpio_sim_chip_name_ctx *ctx = data; - struct fwnode_handle *swnode; - struct gpio_sim_bank *bank; /* This would be the sysfs device exported in /sys/class/gpio. */ if (dev->class) return 0; - swnode = dev_fwnode(dev); + if (device_match_fwnode(dev, ctx->swnode)) + return sprintf(ctx->page, "%s\n", dev_name(dev)); - list_for_each_entry(bank, &ctx->dev->bank_list, siblings) { - if (bank->swnode == swnode) - return sprintf(ctx->page, "%s\n", dev_name(dev)); - } - - return -ENODATA; + return 0; } static ssize_t gpio_sim_bank_config_chip_name_show(struct config_item *item, @@ -1020,7 +1014,7 @@ static ssize_t gpio_sim_bank_config_chip_name_show(struct config_item *item, { struct gpio_sim_bank *bank = to_gpio_sim_bank(item); struct gpio_sim_device *dev = gpio_sim_bank_get_device(bank); - struct gpio_sim_chip_name_ctx ctx = { dev, page }; + struct gpio_sim_chip_name_ctx ctx = { bank->swnode, page }; int ret; mutex_lock(&dev->lock); From a77c46f2b4d48a81f36442ee0c2160baebf6c1a0 Mon Sep 17 00:00:00 2001 From: Han Xu Date: Mon, 11 Jul 2022 11:08:02 -0500 Subject: [PATCH 072/248] MAINTAINERS: change the NXP FSPI driver maintainer. Haibo Chen and me will take over the NXP FSPI driver maintainer role. Signed-off-by: Han Xu Link: https://lore.kernel.org/r/20220711160802.4938-1-han.xu@nxp.com Signed-off-by: Mark Brown --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a6d3bd9d2a8d..3dcedbf2abb5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14234,7 +14234,8 @@ S: Maintained F: drivers/net/phy/nxp-c45-tja11xx.c NXP FSPI DRIVER -M: Ashish Kumar +M: Han Xu +M: Haibo Chen R: Yogesh Gaur L: linux-spi@vger.kernel.org S: Maintained From 046cd8a2a9eec7c2b46b03958a2b6252ddff55b2 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Fri, 24 Jun 2022 06:04:06 -0700 Subject: [PATCH 073/248] drm/i915: fix a possible refcount leak in intel_dp_add_mst_connector() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If drm_connector_init fails, intel_connector_free will be called to take care of proper free. So it is necessary to drop the refcount of port before intel_connector_free. Fixes: 091a4f91942a ("drm/i915: Handle drm-layer errors in intel_dp_add_mst_connector") Signed-off-by: Hangyu Hua Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20220624130406.17996-1-jose.souza@intel.com Signed-off-by: José Roberto de Souza (cherry picked from commit cea9ed611e85d36a05db52b6457bf584b7d969e2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp_mst.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index 061b277e5ce7..14d2a64193b2 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -839,6 +839,7 @@ static struct drm_connector *intel_dp_add_mst_connector(struct drm_dp_mst_topolo ret = drm_connector_init(dev, connector, &intel_dp_mst_connector_funcs, DRM_MODE_CONNECTOR_DisplayPort); if (ret) { + drm_dp_mst_put_port_malloc(port); intel_connector_free(intel_connector); return NULL; } From 1391b9cfd35bb8f10785a17cb4bb5ea8d10faaae Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Tue, 21 Jun 2022 16:30:05 -0700 Subject: [PATCH 074/248] drm/i915/guc: ADL-N should use the same GuC FW as ADL-S The only difference between the ADL S and P GuC FWs is the HWConfig support. ADL-N does not support HWConfig, so we should use the same binary as ADL-S, otherwise the GuC might attempt to fetch a config table that does not exist. ADL-N is internally identified as an ADL-P, so we need to special-case it in the FW selection code. Fixes: 7e28d0b26759 ("drm/i915/adl-n: Enable ADL-N platform") Cc: John Harrison Cc: Tejas Upadhyay Cc: Anusha Srivatsa Cc: Jani Nikula Signed-off-by: Daniele Ceraolo Spurio Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20220621233005.3952293-1-daniele.ceraolospurio@intel.com (cherry picked from commit 971e4a9781742aaad1587e25fd5582b2dd595ef8) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c index f0d7b57b741e..2ff55b9994bc 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c @@ -162,6 +162,15 @@ __uc_fw_auto_select(struct drm_i915_private *i915, struct intel_uc_fw *uc_fw) u8 rev = INTEL_REVID(i915); int i; + /* + * The only difference between the ADL GuC FWs is the HWConfig support. + * ADL-N does not support HWConfig, so we should use the same binary as + * ADL-S, otherwise the GuC might attempt to fetch a config table that + * does not exist. + */ + if (IS_ADLP_N(i915)) + p = INTEL_ALDERLAKE_S; + GEM_BUG_ON(uc_fw->type >= ARRAY_SIZE(blobs_all)); fw_blobs = blobs_all[uc_fw->type].blobs; fw_count = blobs_all[uc_fw->type].count; From 48da0f67c53eecd2594c302be6c8a665b7740eaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Mon, 20 Jun 2022 14:36:59 +0200 Subject: [PATCH 075/248] drm/i915: Fix vm use-after-free in vma destruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In vma destruction, the following race may occur: Thread 1: Thread 2: i915_vma_destroy(); ... list_del_init(vma->vm_link); ... mutex_unlock(vma->vm->mutex); __i915_vm_release(); release_references(); And in release_reference() we dereference vma->vm to get to the vm gt pointer, leading to a use-after free. However, __i915_vm_release() grabs the vm->mutex so the vm won't be destroyed before vma->vm->mutex is released, so extract the gt pointer under the vm->mutex to avoid the vma->vm dereference in release_references(). v2: Fix a typo in the commit message (Andi Shyti) Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5944 Fixes: e1a7ab4fca0c ("drm/i915: Remove the vm open count") Cc: Niranjana Vishwanathapura Cc: Matthew Auld Signed-off-by: Thomas Hellström Acked-by: Nirmoy Das Reviewed-by: Andrzej Hajda Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20220620123659.381772-1-thomas.hellstrom@linux.intel.com (cherry picked from commit 1926a6b75954fc1a8b44d10bd0c67db957b78cf7) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_vma.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 0bffb70b3c5f..04d12f278f57 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -1637,10 +1637,10 @@ static void force_unbind(struct i915_vma *vma) GEM_BUG_ON(drm_mm_node_allocated(&vma->node)); } -static void release_references(struct i915_vma *vma, bool vm_ddestroy) +static void release_references(struct i915_vma *vma, struct intel_gt *gt, + bool vm_ddestroy) { struct drm_i915_gem_object *obj = vma->obj; - struct intel_gt *gt = vma->vm->gt; GEM_BUG_ON(i915_vma_is_active(vma)); @@ -1695,11 +1695,12 @@ void i915_vma_destroy_locked(struct i915_vma *vma) force_unbind(vma); list_del_init(&vma->vm_link); - release_references(vma, false); + release_references(vma, vma->vm->gt, false); } void i915_vma_destroy(struct i915_vma *vma) { + struct intel_gt *gt; bool vm_ddestroy; mutex_lock(&vma->vm->mutex); @@ -1707,8 +1708,11 @@ void i915_vma_destroy(struct i915_vma *vma) list_del_init(&vma->vm_link); vm_ddestroy = vma->vm_ddestroy; vma->vm_ddestroy = false; + + /* vma->vm may be freed when releasing vma->vm->mutex. */ + gt = vma->vm->gt; mutex_unlock(&vma->vm->mutex); - release_references(vma, vm_ddestroy); + release_references(vma, gt, vm_ddestroy); } void i915_vma_parked(struct intel_gt *gt) From 896dcabd1f8f613c533d948df17408c41f8929f5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 8 Jul 2022 12:41:04 +0300 Subject: [PATCH 076/248] drm/i915/selftests: fix a couple IS_ERR() vs NULL tests The shmem_pin_map() function doesn't return error pointers, it returns NULL. Fixes: be1cb55a07bf ("drm/i915/gt: Keep a no-frills swappable copy of the default context state") Signed-off-by: Dan Carpenter Reviewed-by: Matthew Auld Signed-off-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20220708094104.GL2316@kadam (cherry picked from commit d50f5a109cf4ed50c5b575c1bb5fc3bd17b23308) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/selftest_lrc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c index 8b2c11dbe354..1109088fe8f6 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -176,8 +176,8 @@ static int live_lrc_layout(void *arg) continue; hw = shmem_pin_map(engine->default_state); - if (IS_ERR(hw)) { - err = PTR_ERR(hw); + if (!hw) { + err = -ENOMEM; break; } hw += LRC_STATE_OFFSET / sizeof(*hw); @@ -365,8 +365,8 @@ static int live_lrc_fixed(void *arg) continue; hw = shmem_pin_map(engine->default_state); - if (IS_ERR(hw)) { - err = PTR_ERR(hw); + if (!hw) { + err = -ENOMEM; break; } hw += LRC_STATE_OFFSET / sizeof(*hw); From aff1e0b09b54b64944b7fe32997229552737b9e9 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Mon, 11 Jul 2022 09:58:59 +0100 Subject: [PATCH 077/248] drm/i915/ttm: fix sg_table construction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we encounter some monster sized local-memory page that exceeds the maximum sg length (UINT32_MAX), ensure that don't end up with some misaligned address in the entry that follows, leading to fireworks later. Also ensure we have some coverage of this in the selftests. v2(Chris): - Use round_down consistently to avoid udiv errors v3(Nirmoy): - Also update the max_segment in the selftest Fixes: f701b16d4cc5 ("drm/i915/ttm: add i915_sg_from_buddy_resource") Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/6379 Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Nirmoy Das Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20220711085859.24198-1-matthew.auld@intel.com (cherry picked from commit bc99f1209f19fefa3ee11e77464ccfae541f4291) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 11 ++++++++-- drivers/gpu/drm/i915/i915_scatterlist.c | 19 +++++++++++++---- drivers/gpu/drm/i915/i915_scatterlist.h | 6 ++++-- drivers/gpu/drm/i915/intel_region_ttm.c | 10 ++++++--- drivers/gpu/drm/i915/intel_region_ttm.h | 3 ++- .../drm/i915/selftests/intel_memory_region.c | 21 +++++++++++++++++-- drivers/gpu/drm/i915/selftests/mock_region.c | 3 ++- 7 files changed, 58 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c index 4c25d9b2f138..d30ebcaec8b9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c @@ -620,10 +620,15 @@ i915_ttm_resource_get_st(struct drm_i915_gem_object *obj, struct ttm_resource *res) { struct ttm_buffer_object *bo = i915_gem_to_ttm(obj); + u64 page_alignment; if (!i915_ttm_gtt_binds_lmem(res)) return i915_ttm_tt_get_st(bo->ttm); + page_alignment = bo->page_alignment << PAGE_SHIFT; + if (!page_alignment) + page_alignment = obj->mm.region->min_page_size; + /* * If CPU mapping differs, we need to add the ttm_tt pages to * the resulting st. Might make sense for GGTT. @@ -634,7 +639,8 @@ i915_ttm_resource_get_st(struct drm_i915_gem_object *obj, struct i915_refct_sgt *rsgt; rsgt = intel_region_ttm_resource_to_rsgt(obj->mm.region, - res); + res, + page_alignment); if (IS_ERR(rsgt)) return rsgt; @@ -643,7 +649,8 @@ i915_ttm_resource_get_st(struct drm_i915_gem_object *obj, return i915_refct_sgt_get(obj->ttm.cached_io_rsgt); } - return intel_region_ttm_resource_to_rsgt(obj->mm.region, res); + return intel_region_ttm_resource_to_rsgt(obj->mm.region, res, + page_alignment); } static int i915_ttm_truncate(struct drm_i915_gem_object *obj) diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c b/drivers/gpu/drm/i915/i915_scatterlist.c index 159571b9bd24..f63b50b71e10 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.c +++ b/drivers/gpu/drm/i915/i915_scatterlist.c @@ -68,6 +68,7 @@ void i915_refct_sgt_init(struct i915_refct_sgt *rsgt, size_t size) * drm_mm_node * @node: The drm_mm_node. * @region_start: An offset to add to the dma addresses of the sg list. + * @page_alignment: Required page alignment for each sg entry. Power of two. * * Create a struct sg_table, initializing it from a struct drm_mm_node, * taking a maximum segment length into account, splitting into segments @@ -77,15 +78,18 @@ void i915_refct_sgt_init(struct i915_refct_sgt *rsgt, size_t size) * error code cast to an error pointer on failure. */ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, - u64 region_start) + u64 region_start, + u64 page_alignment) { - const u64 max_segment = SZ_1G; /* Do we have a limit on this? */ + const u64 max_segment = round_down(UINT_MAX, page_alignment); u64 segment_pages = max_segment >> PAGE_SHIFT; u64 block_size, offset, prev_end; struct i915_refct_sgt *rsgt; struct sg_table *st; struct scatterlist *sg; + GEM_BUG_ON(!max_segment); + rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL); if (!rsgt) return ERR_PTR(-ENOMEM); @@ -112,6 +116,8 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, sg = __sg_next(sg); sg_dma_address(sg) = region_start + offset; + GEM_BUG_ON(!IS_ALIGNED(sg_dma_address(sg), + page_alignment)); sg_dma_len(sg) = 0; sg->length = 0; st->nents++; @@ -138,6 +144,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, * i915_buddy_block list * @res: The struct i915_ttm_buddy_resource. * @region_start: An offset to add to the dma addresses of the sg list. + * @page_alignment: Required page alignment for each sg entry. Power of two. * * Create a struct sg_table, initializing it from struct i915_buddy_block list, * taking a maximum segment length into account, splitting into segments @@ -147,11 +154,12 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, * error code cast to an error pointer on failure. */ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, - u64 region_start) + u64 region_start, + u64 page_alignment) { struct i915_ttm_buddy_resource *bman_res = to_ttm_buddy_resource(res); const u64 size = res->num_pages << PAGE_SHIFT; - const u64 max_segment = rounddown(UINT_MAX, PAGE_SIZE); + const u64 max_segment = round_down(UINT_MAX, page_alignment); struct drm_buddy *mm = bman_res->mm; struct list_head *blocks = &bman_res->blocks; struct drm_buddy_block *block; @@ -161,6 +169,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, resource_size_t prev_end; GEM_BUG_ON(list_empty(blocks)); + GEM_BUG_ON(!max_segment); rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL); if (!rsgt) @@ -191,6 +200,8 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, sg = __sg_next(sg); sg_dma_address(sg) = region_start + offset; + GEM_BUG_ON(!IS_ALIGNED(sg_dma_address(sg), + page_alignment)); sg_dma_len(sg) = 0; sg->length = 0; st->nents++; diff --git a/drivers/gpu/drm/i915/i915_scatterlist.h b/drivers/gpu/drm/i915/i915_scatterlist.h index 12c6a1684081..b13e4cdea923 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.h +++ b/drivers/gpu/drm/i915/i915_scatterlist.h @@ -213,9 +213,11 @@ static inline void __i915_refct_sgt_init(struct i915_refct_sgt *rsgt, void i915_refct_sgt_init(struct i915_refct_sgt *rsgt, size_t size); struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, - u64 region_start); + u64 region_start, + u64 page_alignment); struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, - u64 region_start); + u64 region_start, + u64 page_alignment); #endif diff --git a/drivers/gpu/drm/i915/intel_region_ttm.c b/drivers/gpu/drm/i915/intel_region_ttm.c index 62ff77445b01..6873808a7015 100644 --- a/drivers/gpu/drm/i915/intel_region_ttm.c +++ b/drivers/gpu/drm/i915/intel_region_ttm.c @@ -152,6 +152,7 @@ int intel_region_ttm_fini(struct intel_memory_region *mem) * Convert an opaque TTM resource manager resource to a refcounted sg_table. * @mem: The memory region. * @res: The resource manager resource obtained from the TTM resource manager. + * @page_alignment: Required page alignment for each sg entry. Power of two. * * The gem backends typically use sg-tables for operations on the underlying * io_memory. So provide a way for the backends to translate the @@ -161,16 +162,19 @@ int intel_region_ttm_fini(struct intel_memory_region *mem) */ struct i915_refct_sgt * intel_region_ttm_resource_to_rsgt(struct intel_memory_region *mem, - struct ttm_resource *res) + struct ttm_resource *res, + u64 page_alignment) { if (mem->is_range_manager) { struct ttm_range_mgr_node *range_node = to_ttm_range_mgr_node(res); return i915_rsgt_from_mm_node(&range_node->mm_nodes[0], - mem->region.start); + mem->region.start, + page_alignment); } else { - return i915_rsgt_from_buddy_resource(res, mem->region.start); + return i915_rsgt_from_buddy_resource(res, mem->region.start, + page_alignment); } } diff --git a/drivers/gpu/drm/i915/intel_region_ttm.h b/drivers/gpu/drm/i915/intel_region_ttm.h index cf9d86dcf409..98fba5155619 100644 --- a/drivers/gpu/drm/i915/intel_region_ttm.h +++ b/drivers/gpu/drm/i915/intel_region_ttm.h @@ -24,7 +24,8 @@ int intel_region_ttm_fini(struct intel_memory_region *mem); struct i915_refct_sgt * intel_region_ttm_resource_to_rsgt(struct intel_memory_region *mem, - struct ttm_resource *res); + struct ttm_resource *res, + u64 page_alignment); void intel_region_ttm_resource_free(struct intel_memory_region *mem, struct ttm_resource *res); diff --git a/drivers/gpu/drm/i915/selftests/intel_memory_region.c b/drivers/gpu/drm/i915/selftests/intel_memory_region.c index 73eb53edb8de..3b18e5905c86 100644 --- a/drivers/gpu/drm/i915/selftests/intel_memory_region.c +++ b/drivers/gpu/drm/i915/selftests/intel_memory_region.c @@ -451,7 +451,6 @@ out_put: static int igt_mock_max_segment(void *arg) { - const unsigned int max_segment = rounddown(UINT_MAX, PAGE_SIZE); struct intel_memory_region *mem = arg; struct drm_i915_private *i915 = mem->i915; struct i915_ttm_buddy_resource *res; @@ -460,7 +459,10 @@ static int igt_mock_max_segment(void *arg) struct drm_buddy *mm; struct list_head *blocks; struct scatterlist *sg; + I915_RND_STATE(prng); LIST_HEAD(objects); + unsigned int max_segment; + unsigned int ps; u64 size; int err = 0; @@ -472,7 +474,13 @@ static int igt_mock_max_segment(void *arg) */ size = SZ_8G; - mem = mock_region_create(i915, 0, size, PAGE_SIZE, 0, 0); + ps = PAGE_SIZE; + if (i915_prandom_u64_state(&prng) & 1) + ps = SZ_64K; /* For something like DG2 */ + + max_segment = round_down(UINT_MAX, ps); + + mem = mock_region_create(i915, 0, size, ps, 0, 0); if (IS_ERR(mem)) return PTR_ERR(mem); @@ -498,12 +506,21 @@ static int igt_mock_max_segment(void *arg) } for (sg = obj->mm.pages->sgl; sg; sg = sg_next(sg)) { + dma_addr_t daddr = sg_dma_address(sg); + if (sg->length > max_segment) { pr_err("%s: Created an oversized scatterlist entry, %u > %u\n", __func__, sg->length, max_segment); err = -EINVAL; goto out_close; } + + if (!IS_ALIGNED(daddr, ps)) { + pr_err("%s: Created an unaligned scatterlist entry, addr=%pa, ps=%u\n", + __func__, &daddr, ps); + err = -EINVAL; + goto out_close; + } } out_close: diff --git a/drivers/gpu/drm/i915/selftests/mock_region.c b/drivers/gpu/drm/i915/selftests/mock_region.c index 670557ce1024..bac21fe84ca5 100644 --- a/drivers/gpu/drm/i915/selftests/mock_region.c +++ b/drivers/gpu/drm/i915/selftests/mock_region.c @@ -33,7 +33,8 @@ static int mock_region_get_pages(struct drm_i915_gem_object *obj) return PTR_ERR(obj->mm.res); obj->mm.rsgt = intel_region_ttm_resource_to_rsgt(obj->mm.region, - obj->mm.res); + obj->mm.res, + obj->mm.region->min_page_size); if (IS_ERR(obj->mm.rsgt)) { err = PTR_ERR(obj->mm.rsgt); goto err_free_resource; From b24dcf1dc507f69ed3b5c66c2b6a0209ae80d4d4 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 12 Jul 2022 16:21:32 +0100 Subject: [PATCH 078/248] drm/i915/gt: Serialize GRDOM access between multiple engine resets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't allow two engines to be reset in parallel, as they would both try to select a reset bit (and send requests to common registers) and wait on that register, at the same time. Serialize control of the reset requests/acks using the uncore->lock, which will also ensure that no other GT state changes at the same time as the actual reset. Cc: stable@vger.kernel.org # v4.4 and upper Reported-by: Mika Kuoppala Signed-off-by: Chris Wilson Acked-by: Mika Kuoppala Reviewed-by: Andi Shyti Reviewed-by: Andrzej Hajda Acked-by: Thomas Hellström Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/e0a2d894e77aed7c2e36b0d1abdc7dbac3011729.1657639152.git.mchehab@kernel.org (cherry picked from commit 336561a914fc0c6f1218228718f633b31b7af1c3) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_reset.c | 37 ++++++++++++++++++++------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index a5338c3fde7a..c68d36fb5bbd 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -300,9 +300,9 @@ static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask) return err; } -static int gen6_reset_engines(struct intel_gt *gt, - intel_engine_mask_t engine_mask, - unsigned int retry) +static int __gen6_reset_engines(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) { struct intel_engine_cs *engine; u32 hw_mask; @@ -321,6 +321,20 @@ static int gen6_reset_engines(struct intel_gt *gt, return gen6_hw_domain_reset(gt, hw_mask); } +static int gen6_reset_engines(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(>->uncore->lock, flags); + ret = __gen6_reset_engines(gt, engine_mask, retry); + spin_unlock_irqrestore(>->uncore->lock, flags); + + return ret; +} + static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine) { int vecs_id; @@ -487,9 +501,9 @@ static void gen11_unlock_sfc(struct intel_engine_cs *engine) rmw_clear_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit); } -static int gen11_reset_engines(struct intel_gt *gt, - intel_engine_mask_t engine_mask, - unsigned int retry) +static int __gen11_reset_engines(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) { struct intel_engine_cs *engine; intel_engine_mask_t tmp; @@ -583,8 +597,11 @@ static int gen8_reset_engines(struct intel_gt *gt, struct intel_engine_cs *engine; const bool reset_non_ready = retry >= 1; intel_engine_mask_t tmp; + unsigned long flags; int ret; + spin_lock_irqsave(>->uncore->lock, flags); + for_each_engine_masked(engine, gt, engine_mask, tmp) { ret = gen8_engine_reset_prepare(engine); if (ret && !reset_non_ready) @@ -612,17 +629,19 @@ static int gen8_reset_engines(struct intel_gt *gt, * This is best effort, so ignore any error from the initial reset. */ if (IS_DG2(gt->i915) && engine_mask == ALL_ENGINES) - gen11_reset_engines(gt, gt->info.engine_mask, 0); + __gen11_reset_engines(gt, gt->info.engine_mask, 0); if (GRAPHICS_VER(gt->i915) >= 11) - ret = gen11_reset_engines(gt, engine_mask, retry); + ret = __gen11_reset_engines(gt, engine_mask, retry); else - ret = gen6_reset_engines(gt, engine_mask, retry); + ret = __gen6_reset_engines(gt, engine_mask, retry); skip_reset: for_each_engine_masked(engine, gt, engine_mask, tmp) gen8_engine_reset_cancel(engine); + spin_unlock_irqrestore(>->uncore->lock, flags); + return ret; } From a1c5a7bf79c1faa5633b918b5c0666545e84c4d1 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 12 Jul 2022 16:21:33 +0100 Subject: [PATCH 079/248] drm/i915/gt: Serialize TLB invalidates with GT resets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid trying to invalidate the TLB in the middle of performing an engine reset, as this may result in the reset timing out. Currently, the TLB invalidate is only serialised by its own mutex, forgoing the uncore lock, but we can take the uncore->lock as well to serialise the mmio access, thereby serialising with the GDRST. Tested on a NUC5i7RYB, BIOS RYBDWi35.86A.0380.2019.0517.1530 with i915 selftest/hangcheck. Cc: stable@vger.kernel.org # v4.4 and upper Fixes: 7938d61591d3 ("drm/i915: Flush TLBs before releasing backing store") Reported-by: Mauro Carvalho Chehab Tested-by: Mauro Carvalho Chehab Reviewed-by: Mauro Carvalho Chehab Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Andi Shyti Acked-by: Thomas Hellström Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/1e59a7c45dd919a530256b9ac721ac6ea86c0677.1657639152.git.mchehab@kernel.org (cherry picked from commit 33da97894758737895e90c909f16786052680ef4) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_gt.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index 51a0fe60c050..531af6ad7007 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -1209,6 +1209,20 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt) mutex_lock(>->tlb_invalidate_lock); intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */ + + for_each_engine(engine, gt, id) { + struct reg_and_bit rb; + + rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); + if (!i915_mmio_reg_offset(rb.reg)) + continue; + + intel_uncore_write_fw(uncore, rb.reg, rb.bit); + } + + spin_unlock_irq(&uncore->lock); + for_each_engine(engine, gt, id) { /* * HW architecture suggest typical invalidation time at 40us, @@ -1223,7 +1237,6 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt) if (!i915_mmio_reg_offset(rb.reg)) continue; - intel_uncore_write_fw(uncore, rb.reg, rb.bit); if (__intel_wait_for_register_fw(uncore, rb.reg, rb.bit, 0, timeout_us, timeout_ms, From ad765fae792e16ce3c1d0b69ce939e3f7dba40ab Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 8 Jul 2022 16:20:11 +0200 Subject: [PATCH 080/248] drm/i915/gem: Look for waitboosting across the whole object prior to individual waits We employ a "waitboost" heuristic to detect when userspace is stalled waiting for results from earlier execution. Under latency sensitive work mixed between the gpu/cpu, the GPU is typically under-utilised and so RPS sees that low utilisation as a reason to downclock the frequency, causing longer stalls and lower throughput. The user left waiting for the results is not impressed. On applying commit 047a1b877ed4 ("dma-buf & drm/amdgpu: remove dma_resv workaround") it was observed that deinterlacing h264 on Haswell performance dropped by 2-5x. The reason being that the natural workload was not intense enough to trigger RPS (using HW evaluation intervals) to upclock, and so it was depending on waitboosting for the throughput. Commit 047a1b877ed4 ("dma-buf & drm/amdgpu: remove dma_resv workaround") changes the composition of dma-resv from keeping a single write fence + multiple read fences, to a single array of multiple write and read fences (a maximum of one pair of write/read fences per context). The iteration order was also changed implicitly from all-read fences then the single write fence, to a mix of write fences followed by read fences. It is that ordering change that belied the fragility of waitboosting. Currently, a waitboost is inspected at the point of waiting on an outstanding fence. If the GPU is backlogged such that we haven't yet stated the request we need to wait on, we force the GPU to upclock until the completion of that request. By changing the order in which we waited upon requests, we ended up waiting on those requests in sequence and as such we saw that each request was already started and so not a suitable candidate for waitboosting. Instead of asking whether to boost each fence in turn, we can look at whether boosting is required for the dma-resv ensemble prior to waiting on any fence, making the heuristic more robust to the order in which fences are stored in the dma-resv. Reported-by: Thomas Voegtle Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/6284 Fixes: 047a1b877ed4 ("dma-buf & drm/amdgpu: remove dma_resv workaround") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Signed-off-by: Karolina Drobnik Tested-by: Thomas Voegtle Reviewed-by: Andi Shyti Acked-by: Rodrigo Vivi Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/07e05518d9f6620d20cc1101ec1849203fe973f9.1657289332.git.karolina.drobnik@intel.com (cherry picked from commit 394e2b57a989113de494c52d4683444bcb02d4e1) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_wait.c | 34 ++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c b/drivers/gpu/drm/i915/gem/i915_gem_wait.c index 319936f91ac5..e6e01c2a74a6 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c @@ -9,6 +9,7 @@ #include #include "gt/intel_engine.h" +#include "gt/intel_rps.h" #include "i915_gem_ioctls.h" #include "i915_gem_object.h" @@ -31,6 +32,37 @@ i915_gem_object_wait_fence(struct dma_fence *fence, timeout); } +static void +i915_gem_object_boost(struct dma_resv *resv, unsigned int flags) +{ + struct dma_resv_iter cursor; + struct dma_fence *fence; + + /* + * Prescan all fences for potential boosting before we begin waiting. + * + * When we wait, we wait on outstanding fences serially. If the + * dma-resv contains a sequence such as 1:1, 1:2 instead of a reduced + * form 1:2, then as we look at each wait in turn we see that each + * request is currently executing and not worthy of boosting. But if + * we only happen to look at the final fence in the sequence (because + * of request coalescing or splitting between read/write arrays by + * the iterator), then we would boost. As such our decision to boost + * or not is delicately balanced on the order we wait on fences. + * + * So instead of looking for boosts sequentially, look for all boosts + * upfront and then wait on the outstanding fences. + */ + + dma_resv_iter_begin(&cursor, resv, + dma_resv_usage_rw(flags & I915_WAIT_ALL)); + dma_resv_for_each_fence_unlocked(&cursor, fence) + if (dma_fence_is_i915(fence) && + !i915_request_started(to_request(fence))) + intel_rps_boost(to_request(fence)); + dma_resv_iter_end(&cursor); +} + static long i915_gem_object_wait_reservation(struct dma_resv *resv, unsigned int flags, @@ -40,6 +72,8 @@ i915_gem_object_wait_reservation(struct dma_resv *resv, struct dma_fence *fence; long ret = timeout ?: 1; + i915_gem_object_boost(resv, flags); + dma_resv_iter_begin(&cursor, resv, dma_resv_usage_rw(flags & I915_WAIT_ALL)); dma_resv_for_each_fence_unlocked(&cursor, fence) { From 333991c4e66b3d4b5613315f18016da80344f659 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Fri, 24 Jun 2022 13:35:28 +0200 Subject: [PATCH 081/248] drm/i915/selftests: fix subtraction overflow bug On some machines hole_end can be small enough to cause subtraction overflow. On the other side (addr + 2 * min_alignment) can overflow in case of mock tests. This patch should handle both cases. Fixes: e1c5f754067b59 ("drm/i915: Avoid overflow in computing pot_hole loop termination") Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/3674 Signed-off-by: Andrzej Hajda Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20220624113528.2159210-1-andrzej.hajda@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit ab3edc679c552a466e4bf0b11af3666008bd65a2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c index 8633bec18fa7..ab9f17fc85bc 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c @@ -742,7 +742,7 @@ static int pot_hole(struct i915_address_space *vm, u64 addr; for (addr = round_up(hole_start + min_alignment, step) - min_alignment; - addr <= round_down(hole_end - (2 * min_alignment), step) - min_alignment; + hole_end > addr && hole_end - addr >= 2 * min_alignment; addr += step) { err = i915_vma_pin(vma, 0, 0, addr | flags); if (err) { From 7c239a071d1f04b7137789810807b4108d475c72 Mon Sep 17 00:00:00 2001 From: Lucien Buchmann Date: Sat, 25 Jun 2022 02:17:44 +0200 Subject: [PATCH 082/248] USB: serial: ftdi_sio: add Belimo device ids Those two product ids are known. Signed-off-by: Lucien Buchmann Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/ftdi_sio.c | 3 +++ drivers/usb/serial/ftdi_sio_ids.h | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index b440d338a895..d5a3986dfee7 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -1023,6 +1023,9 @@ static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(FTDI_VID, CHETCO_SEASMART_DISPLAY_PID) }, { USB_DEVICE(FTDI_VID, CHETCO_SEASMART_LITE_PID) }, { USB_DEVICE(FTDI_VID, CHETCO_SEASMART_ANALOG_PID) }, + /* Belimo Automation devices */ + { USB_DEVICE(FTDI_VID, BELIMO_ZTH_PID) }, + { USB_DEVICE(FTDI_VID, BELIMO_ZIP_PID) }, /* ICP DAS I-756xU devices */ { USB_DEVICE(ICPDAS_VID, ICPDAS_I7560U_PID) }, { USB_DEVICE(ICPDAS_VID, ICPDAS_I7561U_PID) }, diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index d1a9564697a4..4e92c165c86b 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -1568,6 +1568,12 @@ #define CHETCO_SEASMART_LITE_PID 0xA5AE /* SeaSmart Lite USB Adapter */ #define CHETCO_SEASMART_ANALOG_PID 0xA5AF /* SeaSmart Analog Adapter */ +/* + * Belimo Automation + */ +#define BELIMO_ZTH_PID 0x8050 +#define BELIMO_ZIP_PID 0xC811 + /* * Unjo AB */ From 68e3c69803dada336893640110cb87221bb01dcf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jul 2022 15:07:26 +0200 Subject: [PATCH 083/248] perf/core: Fix data race between perf_event_set_output() and perf_mmap_close() Yang Jihing reported a race between perf_event_set_output() and perf_mmap_close(): CPU1 CPU2 perf_mmap_close(e2) if (atomic_dec_and_test(&e2->rb->mmap_count)) // 1 - > 0 detach_rest = true ioctl(e1, IOC_SET_OUTPUT, e2) perf_event_set_output(e1, e2) ... list_for_each_entry_rcu(e, &e2->rb->event_list, rb_entry) ring_buffer_attach(e, NULL); // e1 isn't yet added and // therefore not detached ring_buffer_attach(e1, e2->rb) list_add_rcu(&e1->rb_entry, &e2->rb->event_list) After this; e1 is attached to an unmapped rb and a subsequent perf_mmap() will loop forever more: again: mutex_lock(&e->mmap_mutex); if (event->rb) { ... if (!atomic_inc_not_zero(&e->rb->mmap_count)) { ... mutex_unlock(&e->mmap_mutex); goto again; } } The loop in perf_mmap_close() holds e2->mmap_mutex, while the attach in perf_event_set_output() holds e1->mmap_mutex. As such there is no serialization to avoid this race. Change perf_event_set_output() to take both e1->mmap_mutex and e2->mmap_mutex to alleviate that problem. Additionally, have the loop in perf_mmap() detach the rb directly, this avoids having to wait for the concurrent perf_mmap_close() to get around to doing it to make progress. Fixes: 9bb5d40cd93c ("perf: Fix mmap() accounting hole") Reported-by: Yang Jihong Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yang Jihong Link: https://lkml.kernel.org/r/YsQ3jm2GR38SW7uD@worktop.programming.kicks-ass.net --- kernel/events/core.c | 45 ++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 80782cddb1da..d2b354991bf5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6253,10 +6253,10 @@ again: if (!atomic_inc_not_zero(&event->rb->mmap_count)) { /* - * Raced against perf_mmap_close() through - * perf_event_set_output(). Try again, hope for better - * luck. + * Raced against perf_mmap_close(); remove the + * event and try again. */ + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); goto again; } @@ -11825,14 +11825,25 @@ err_size: goto out; } +static void mutex_lock_double(struct mutex *a, struct mutex *b) +{ + if (b < a) + swap(a, b); + + mutex_lock(a); + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); +} + static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { struct perf_buffer *rb = NULL; int ret = -EINVAL; - if (!output_event) + if (!output_event) { + mutex_lock(&event->mmap_mutex); goto set; + } /* don't allow circular references */ if (event == output_event) @@ -11870,8 +11881,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) event->pmu != output_event->pmu) goto out; + /* + * Hold both mmap_mutex to serialize against perf_mmap_close(). Since + * output_event is already on rb->event_list, and the list iteration + * restarts after every removal, it is guaranteed this new event is + * observed *OR* if output_event is already removed, it's guaranteed we + * observe !rb->mmap_count. + */ + mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: - mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ if (atomic_read(&event->mmap_count)) goto unlock; @@ -11881,6 +11899,12 @@ set: rb = ring_buffer_get(output_event); if (!rb) goto unlock; + + /* did we race against perf_mmap_close() */ + if (!atomic_read(&rb->mmap_count)) { + ring_buffer_put(rb); + goto unlock; + } } ring_buffer_attach(event, rb); @@ -11888,20 +11912,13 @@ set: ret = 0; unlock: mutex_unlock(&event->mmap_mutex); + if (output_event) + mutex_unlock(&output_event->mmap_mutex); out: return ret; } -static void mutex_lock_double(struct mutex *a, struct mutex *b) -{ - if (b < a) - swap(a, b); - - mutex_lock(a); - mutex_lock_nested(b, SINGLE_DEPTH_NESTING); -} - static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; From 3131ef39fb03bbde237d0b8260445898f3dfda5b Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 13 Jul 2022 11:50:46 +0200 Subject: [PATCH 084/248] x86/asm/32: Fix ANNOTATE_UNRET_SAFE use on 32-bit The build on x86_32 currently fails after commit 9bb2ec608a20 (objtool: Update Retpoline validation) with: arch/x86/kernel/../../x86/xen/xen-head.S:35: Error: no such instruction: `annotate_unret_safe' ANNOTATE_UNRET_SAFE is defined in nospec-branch.h. And head_32.S is missing this include. Fix this. Fixes: 9bb2ec608a20 ("objtool: Update Retpoline validation") Signed-off-by: Jiri Slaby Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/63e23f80-033f-f64e-7522-2816debbc367@kernel.org --- arch/x86/kernel/head_32.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index eb8656bac99b..9b7acc9c7874 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include From 230ec83d4299b30c51a1c133b4f2a669972cc08a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 8 Jul 2022 15:14:56 +0200 Subject: [PATCH 085/248] x86/pat: Fix x86_has_pat_wp() x86_has_pat_wp() is using a wrong test, as it relies on the normal PAT configuration used by the kernel. In case the PAT MSR has been setup by another entity (e.g. Xen hypervisor) it might return false even if the PAT configuration is allowing WP mappings. This due to the fact that when running as Xen PV guest the PAT MSR is setup by the hypervisor and cannot be changed by the guest. This results in the WP related entry to be at a different position when running as Xen PV guest compared to the bare metal or fully virtualized case. The correct way to test for WP support is: 1. Get the PTE protection bits needed to select WP mode by reading __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP] (depending on the PAT MSR setting this might return protection bits for a stronger mode, e.g. UC-) 2. Translate those bits back into the real cache mode selected by those PTE bits by reading __pte2cachemode_tbl[__pte2cm_idx(prot)] 3. Test for the cache mode to be _PAGE_CACHE_MODE_WP Fixes: f88a68facd9a ("x86/mm: Extend early_memremap() support with additional attrs") Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov Cc: # 4.14 Link: https://lore.kernel.org/r/20220503132207.17234-1-jgross@suse.com --- arch/x86/mm/init.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d8cfce221275..57ba5502aecf 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -77,10 +77,20 @@ static uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; -/* Check that the write-protect PAT entry is set for write-protect */ +/* + * Check that the write-protect PAT entry is set for write-protect. + * To do this without making assumptions how PAT has been set up (Xen has + * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache + * mode via the __cachemode2pte_tbl[] into protection bits (those protection + * bits will select a cache mode of WP or better), and then translate the + * protection bits back into the cache mode using __pte2cm_idx() and the + * __pte2cachemode_tbl[] array. This will return the really used cache mode. + */ bool x86_has_pat_wp(void) { - return __pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] == _PAGE_CACHE_MODE_WP; + uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP]; + + return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP; } enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) From d2394860b45c3c1484e4b0a5d09909a1e3f6569e Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 13 Jul 2022 13:15:36 +0200 Subject: [PATCH 086/248] cpufreq: mediatek: Handle sram regulator probe deferral If the regulator_get_optional() call for the SRAM regulator returns a probe deferral, we must bail out and retry probing later: failing to do this will produce unstabilities on platforms requiring the handling for this regulator. Fixes: ffa7bdf7f344 ("cpufreq: mediatek: Make sram regulator optional") Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Viresh Kumar --- drivers/cpufreq/mediatek-cpufreq.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c index 37a1eb20f5ba..76f6b3884e6b 100644 --- a/drivers/cpufreq/mediatek-cpufreq.c +++ b/drivers/cpufreq/mediatek-cpufreq.c @@ -439,9 +439,13 @@ static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu) /* Both presence and absence of sram regulator are valid cases. */ info->sram_reg = regulator_get_optional(cpu_dev, "sram"); - if (IS_ERR(info->sram_reg)) + if (IS_ERR(info->sram_reg)) { + ret = PTR_ERR(info->sram_reg); + if (ret == -EPROBE_DEFER) + goto out_free_resources; + info->sram_reg = NULL; - else { + } else { ret = regulator_enable(info->sram_reg); if (ret) { dev_warn(cpu_dev, "cpu%d: failed to enable vsram\n", cpu); From c4e789572557aa147b13bf7fe09cc99663ed0cf5 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 27 Jun 2022 14:50:53 +0200 Subject: [PATCH 087/248] s390/nospec: build expoline.o for modules_prepare target When CONFIG_EXPOLINE_EXTERN is used expoline thunks are generated from arch/s390/lib/expoline.S and postlinked into every module. This is also true for external modules. Add expoline.o build to the modules_prepare target. Fixes: 1d2ad084800e ("s390/nospec: add an option to use thunk-extern") Reported-by: Joe Lawrence Tested-by: Sumanth Korikkar Acked-by: Sumanth Korikkar Tested-by: C. Erastus Toe Tested-by: Joe Lawrence Link: https://lore.kernel.org/r/patch-1.thread-d13b6c.git-a2387a74dc49.your-ad-here.call-01656331067-ext-4899@work.hours Signed-off-by: Vasily Gorbik Signed-off-by: Alexander Gordeev --- arch/s390/Makefile | 8 +++++++- arch/s390/lib/Makefile | 3 ++- arch/s390/lib/expoline/Makefile | 3 +++ arch/s390/lib/{ => expoline}/expoline.S | 0 4 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 arch/s390/lib/expoline/Makefile rename arch/s390/lib/{ => expoline}/expoline.S (100%) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 495c68a4df1e..fc72a35a1f07 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -82,7 +82,7 @@ endif ifdef CONFIG_EXPOLINE ifdef CONFIG_EXPOLINE_EXTERN - KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline.o + KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline/expoline.o CC_FLAGS_EXPOLINE := -mindirect-branch=thunk-extern CC_FLAGS_EXPOLINE += -mfunction-return=thunk-extern else @@ -163,6 +163,12 @@ vdso_prepare: prepare0 $(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \ $(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h) + +ifdef CONFIG_EXPOLINE_EXTERN +modules_prepare: expoline_prepare +expoline_prepare: prepare0 + $(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o +endif endif # Don't use tabs in echo arguments diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 5d415b3db6d1..580d2e3265cb 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -7,7 +7,6 @@ lib-y += delay.o string.o uaccess.o find.o spinlock.o obj-y += mem.o xor.o lib-$(CONFIG_KPROBES) += probes.o lib-$(CONFIG_UPROBES) += probes.o -obj-$(CONFIG_EXPOLINE_EXTERN) += expoline.o obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o test_kprobes_s390-objs += test_kprobes_asm.o test_kprobes.o @@ -22,3 +21,5 @@ obj-$(CONFIG_S390_MODULES_SANITY_TEST) += test_modules.o obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + +obj-$(CONFIG_EXPOLINE_EXTERN) += expoline/ diff --git a/arch/s390/lib/expoline/Makefile b/arch/s390/lib/expoline/Makefile new file mode 100644 index 000000000000..854631d9cb03 --- /dev/null +++ b/arch/s390/lib/expoline/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += expoline.o diff --git a/arch/s390/lib/expoline.S b/arch/s390/lib/expoline/expoline.S similarity index 100% rename from arch/s390/lib/expoline.S rename to arch/s390/lib/expoline/expoline.S From a0b0987a781157263b82f4022649cf686d36c787 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 27 Jun 2022 14:50:56 +0200 Subject: [PATCH 088/248] s390/nospec: remove unneeded header includes Commit 4efd417f298b ("s390: raise minimum supported machine generation to z10") removed the usage of alternatives and lowcore in expolines macros. Remove unneeded header includes as well. With that, expoline.S doesn't require asm-offsets.h and expoline_prepare target dependency could be removed. Tested-by: Joe Lawrence Link: https://lore.kernel.org/r/patch-2.thread-d13b6c.git-d13b6c96fb5f.your-ad-here.call-01656331067-ext-4899@work.hours Signed-off-by: Vasily Gorbik Signed-off-by: Alexander Gordeev --- arch/s390/Makefile | 2 +- arch/s390/include/asm/nospec-insn.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index fc72a35a1f07..4cb5d17e7ead 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -166,7 +166,7 @@ vdso_prepare: prepare0 ifdef CONFIG_EXPOLINE_EXTERN modules_prepare: expoline_prepare -expoline_prepare: prepare0 +expoline_prepare: $(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o endif endif diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h index d910d71b5bb5..7e9e99523e95 100644 --- a/arch/s390/include/asm/nospec-insn.h +++ b/arch/s390/include/asm/nospec-insn.h @@ -2,8 +2,6 @@ #ifndef _ASM_S390_NOSPEC_ASM_H #define _ASM_S390_NOSPEC_ASM_H -#include -#include #include #ifdef __ASSEMBLY__ From acea108fa067d140bd155161a79b1fcd967f4137 Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Wed, 6 Jul 2022 15:52:46 -0400 Subject: [PATCH 089/248] drm/amd/display: Ignore First MST Sideband Message Return Error [why] First MST sideband message returns AUX_RET_ERROR_HPD_DISCON on certain intel platform. Aux transaction considered failure if HPD unexpected pulled low. The actual aux transaction success in such case, hence do not return error. [how] Not returning error when AUX_RET_ERROR_HPD_DISCON detected on the first sideband message. v2: squash in additional DMI entries v3: squash in static fix Signed-off-by: Fangzhi Zuo Acked-by: Solomon Chiu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 39 +++++++++++++++++++ .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 8 ++++ .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 17 ++++++++ 3 files changed, 64 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 1c2984bbda51..ff9820498cce 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include @@ -1382,6 +1383,41 @@ static bool dm_should_disable_stutter(struct pci_dev *pdev) return false; } +static const struct dmi_system_id hpd_disconnect_quirk_table[] = { + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision 3660"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision 3260"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision 3460"), + }, + }, + {} +}; + +static void retrieve_dmi_info(struct amdgpu_display_manager *dm) +{ + const struct dmi_system_id *dmi_id; + + dm->aux_hpd_discon_quirk = false; + + dmi_id = dmi_first_match(hpd_disconnect_quirk_table); + if (dmi_id) { + dm->aux_hpd_discon_quirk = true; + DRM_INFO("aux_hpd_discon_quirk attached\n"); + } +} + static int amdgpu_dm_init(struct amdgpu_device *adev) { struct dc_init_data init_data; @@ -1508,6 +1544,9 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) } INIT_LIST_HEAD(&adev->dm.da_list); + + retrieve_dmi_info(&adev->dm); + /* Display Core create. */ adev->dm.dc = dc_create(&init_data); diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index aa34c0068f41..e80ef93f6550 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -540,6 +540,14 @@ struct amdgpu_display_manager { * last successfully applied backlight values. */ u32 actual_brightness[AMDGPU_DM_MAX_NUM_EDP]; + + /** + * @aux_hpd_discon_quirk: + * + * quirk for hpd discon while aux is on-going. + * occurred on certain intel platform + */ + bool aux_hpd_discon_quirk; }; enum dsc_clock_force_state { diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 9221b6690a4a..2b9b095e5f03 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -56,6 +56,8 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, ssize_t result = 0; struct aux_payload payload; enum aux_return_code_type operation_result; + struct amdgpu_device *adev; + struct ddc_service *ddc; if (WARN_ON(msg->size > 16)) return -E2BIG; @@ -74,6 +76,21 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, result = dc_link_aux_transfer_raw(TO_DM_AUX(aux)->ddc_service, &payload, &operation_result); + /* + * w/a on certain intel platform where hpd is unexpected to pull low during + * 1st sideband message transaction by return AUX_RET_ERROR_HPD_DISCON + * aux transaction is succuess in such case, therefore bypass the error + */ + ddc = TO_DM_AUX(aux)->ddc_service; + adev = ddc->ctx->driver_context; + if (adev->dm.aux_hpd_discon_quirk) { + if (msg->address == DP_SIDEBAND_MSG_DOWN_REQ_BASE && + operation_result == AUX_RET_ERROR_HPD_DISCON) { + result = 0; + operation_result = AUX_RET_SUCCESS; + } + } + if (payload.write && result >= 0) result = msg->size; From c0044865480a162146b9dfe7783e73a08e97b2b9 Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Mon, 11 Jul 2022 16:03:08 +0800 Subject: [PATCH 090/248] drm/amdkfd: correct the MEC atomic support firmware checking for GC 10.3.7 On the GC 10.3.7 platform the initial MEC release version #3 can support atomic operation,so need correct and set its MEC atomic support version to #3. Signed-off-by: Prike Liang Reviewed-by: Aaron Liu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.18.x --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index bf4200457772..a08769c5e94b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -184,6 +184,8 @@ static void kfd_device_info_init(struct kfd_dev *kfd, /* Navi2x+, Navi1x+ */ if (gc_version == IP_VERSION(10, 3, 6)) kfd->device_info.no_atomic_fw_version = 14; + else if (gc_version == IP_VERSION(10, 3, 7)) + kfd->device_info.no_atomic_fw_version = 3; else if (gc_version >= IP_VERSION(10, 3, 0)) kfd->device_info.no_atomic_fw_version = 92; else if (gc_version >= IP_VERSION(10, 1, 1)) From add61d3c31de6a4b5e11a2ab96aaf4c873481568 Mon Sep 17 00:00:00 2001 From: Mario Kleiner Date: Mon, 11 Jul 2022 19:39:28 +0200 Subject: [PATCH 091/248] drm/amd/display: Only use depth 36 bpp linebuffers on DCN display engines. Various DCE versions had trouble with 36 bpp lb depth, requiring fixes, last time in commit 353ca0fa5630 ("drm/amd/display: Fix 10bit 4K display on CIK GPUs") for DCE-8. So far >= DCE-11.2 was considered ok, but now I found out that on DCE-11.2 it causes dithering when there shouldn't be any, so identity pixel passthrough with identity gamma LUTs doesn't work when it should. This breaks various important neuroscience applications, as reported to me by scientific users of Polaris cards under Ubuntu 22.04 with Linux 5.15, and confirmed by testing it myself on DCE-11.2. Lets only use depth 36 for DCN engines, where my testing showed that it is both necessary for high color precision output, e.g., RGBA16 fb's, and not harmful, as far as more than one year in real-world use showed. DCE engines seem to work fine for high precision output at 30 bpp, so this ("famous last words") depth 30 should hopefully fix all known problems without introducing new ones. Successfully retested on DCE-11.2 Polaris and DCN-1.0 Raven Ridge on top of Linux 5.19.0-rc2 + drm-next. Fixes: 353ca0fa5630 ("drm/amd/display: Fix 10bit 4K display on CIK GPUs") Signed-off-by: Mario Kleiner Tested-by: Mario Kleiner Cc: stable@vger.kernel.org # 5.14.0 Cc: Alex Deucher Cc: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index 6774dd8bb53e..3fe3fbac1e63 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -1117,12 +1117,13 @@ bool resource_build_scaling_params(struct pipe_ctx *pipe_ctx) * on certain displays, such as the Sharp 4k. 36bpp is needed * to support SURFACE_PIXEL_FORMAT_GRPH_ARGB16161616 and * SURFACE_PIXEL_FORMAT_GRPH_ABGR16161616 with actual > 10 bpc - * precision on at least DCN display engines. However, at least - * Carrizo with DCE_VERSION_11_0 does not like 36 bpp lb depth, - * so use only 30 bpp on DCE_VERSION_11_0. Testing with DCE 11.2 and 8.3 - * did not show such problems, so this seems to be the exception. + * precision on DCN display engines, but apparently not for DCE, as + * far as testing on DCE-11.2 and DCE-8 showed. Various DCE parts have + * problems: Carrizo with DCE_VERSION_11_0 does not like 36 bpp lb depth, + * neither do DCE-8 at 4k resolution, or DCE-11.2 (broken identify pixel + * passthrough). Therefore only use 36 bpp on DCN where it is actually needed. */ - if (plane_state->ctx->dce_version > DCE_VERSION_11_0) + if (plane_state->ctx->dce_version > DCE_VERSION_MAX) pipe_ctx->plane_res.scl_data.lb_params.depth = LB_PIXEL_DEPTH_36BPP; else pipe_ctx->plane_res.scl_data.lb_params.depth = LB_PIXEL_DEPTH_30BPP; From 0638c98c17aa12fe914459c82cd178247e21fb2b Mon Sep 17 00:00:00 2001 From: Yefim Barashkin Date: Mon, 11 Jul 2022 14:35:11 -0800 Subject: [PATCH 092/248] drm/amd/pm: Prevent divide by zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit divide error: 0000 [#1] SMP PTI CPU: 3 PID: 78925 Comm: tee Not tainted 5.15.50-1-lts #1 Hardware name: MSI MS-7A59/Z270 SLI PLUS (MS-7A59), BIOS 1.90 01/30/2018 RIP: 0010:smu_v11_0_set_fan_speed_rpm+0x11/0x110 [amdgpu] Speed is user-configurable through a file. I accidentally set it to zero, and the driver crashed. Reviewed-by: Evan Quan Reviewed-by: André Almeida Signed-off-by: Yefim Barashkin Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c index 5f8809f6990d..2fbd2926a531 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c @@ -1228,6 +1228,8 @@ int smu_v11_0_set_fan_speed_rpm(struct smu_context *smu, uint32_t crystal_clock_freq = 2500; uint32_t tach_period; + if (speed == 0) + return -EINVAL; /* * To prevent from possible overheat, some ASICs may have requirement * for minimum fan speed: From 47053b1e7382628dd30415685ae257f766a311e4 Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Tue, 12 Jul 2022 10:32:39 -0100 Subject: [PATCH 093/248] drm/amd/display: correct check of coverage blend mode Check the value of per_pixel_alpha to decide whether the Coverage pixel blend mode is applicable or not. Fixes: 76818cdd11a2 ("drm/amd/display: add Coverage blend mode for overlay plane") Reported-by: kernel test robot Reported-by: Dan Carpenter Reviewed-by: Harry Wentland Signed-off-by: Melissa Wen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index ff9820498cce..d2e628358429 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -5446,7 +5446,7 @@ fill_blending_from_plane_state(const struct drm_plane_state *plane_state, } } - if (per_pixel_alpha && plane_state->pixel_blend_mode == DRM_MODE_BLEND_COVERAGE) + if (*per_pixel_alpha && plane_state->pixel_blend_mode == DRM_MODE_BLEND_COVERAGE) *pre_multiplied_alpha = false; } From 3283c83eb6fcfbda8ea03d7149d8e42e71c5d45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Mon, 11 Jul 2022 16:51:31 +0200 Subject: [PATCH 094/248] drm/amd/display: Ensure valid event timestamp for cursor-only commits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Requires enabling the vblank machinery for them. Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2030 Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 43 +++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index d2e628358429..93ac33a8de9a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -463,6 +463,26 @@ static void dm_pflip_high_irq(void *interrupt_params) vrr_active, (int) !e); } +static void dm_crtc_handle_vblank(struct amdgpu_crtc *acrtc) +{ + struct drm_crtc *crtc = &acrtc->base; + struct drm_device *dev = crtc->dev; + unsigned long flags; + + drm_crtc_handle_vblank(crtc); + + spin_lock_irqsave(&dev->event_lock, flags); + + /* Send completion event for cursor-only commits */ + if (acrtc->event && acrtc->pflip_status != AMDGPU_FLIP_SUBMITTED) { + drm_crtc_send_vblank_event(crtc, acrtc->event); + drm_crtc_vblank_put(crtc); + acrtc->event = NULL; + } + + spin_unlock_irqrestore(&dev->event_lock, flags); +} + static void dm_vupdate_high_irq(void *interrupt_params) { struct common_irq_params *irq_params = interrupt_params; @@ -501,7 +521,7 @@ static void dm_vupdate_high_irq(void *interrupt_params) * if a pageflip happened inside front-porch. */ if (vrr_active) { - drm_crtc_handle_vblank(&acrtc->base); + dm_crtc_handle_vblank(acrtc); /* BTR processing for pre-DCE12 ASICs */ if (acrtc->dm_irq_params.stream && @@ -553,7 +573,7 @@ static void dm_crtc_high_irq(void *interrupt_params) * to dm_vupdate_high_irq after end of front-porch. */ if (!vrr_active) - drm_crtc_handle_vblank(&acrtc->base); + dm_crtc_handle_vblank(acrtc); /** * Following stuff must happen at start of vblank, for crc @@ -9174,6 +9194,7 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, struct amdgpu_bo *abo; uint32_t target_vblank, last_flip_vblank; bool vrr_active = amdgpu_dm_vrr_active(acrtc_state); + bool cursor_update = false; bool pflip_present = false; struct { struct dc_surface_update surface_updates[MAX_SURFACES]; @@ -9209,8 +9230,13 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, struct dm_plane_state *dm_new_plane_state = to_dm_plane_state(new_plane_state); /* Cursor plane is handled after stream updates */ - if (plane->type == DRM_PLANE_TYPE_CURSOR) + if (plane->type == DRM_PLANE_TYPE_CURSOR) { + if ((fb && crtc == pcrtc) || + (old_plane_state->fb && old_plane_state->crtc == pcrtc)) + cursor_update = true; + continue; + } if (!fb || !crtc || pcrtc != crtc) continue; @@ -9373,6 +9399,17 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, bundle->stream_update.vrr_infopacket = &acrtc_state->stream->vrr_infopacket; } + } else if (cursor_update && acrtc_state->active_planes > 0 && + !acrtc_state->force_dpms_off && + acrtc_attach->base.state->event) { + drm_crtc_vblank_get(pcrtc); + + spin_lock_irqsave(&pcrtc->dev->event_lock, flags); + + acrtc_attach->event = acrtc_attach->base.state->event; + acrtc_attach->base.state->event = NULL; + + spin_unlock_irqrestore(&pcrtc->dev->event_lock, flags); } /* Update the planes if changed or disable if we don't have any. */ From fbd74d16890b9f5d08ea69b5282b123c894f8860 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 13 Jul 2022 12:53:46 -0500 Subject: [PATCH 095/248] ACPI: CPPC: Fix enabling CPPC on AMD systems with shared memory When commit 72f2ecb7ece7 ("ACPI: bus: Set CPPC _OSC bits for all and when CPPC_LIB is supported") was introduced, we found collateral damage that a number of AMD systems that supported CPPC but didn't advertise support in _OSC stopped having a functional amd-pstate driver. The _OSC was only enforced on Intel systems at that time. This was fixed for the MSR based designs by commit 8b356e536e69f ("ACPI: CPPC: Don't require _OSC if X86_FEATURE_CPPC is supported") but some shared memory based designs also support CPPC but haven't advertised support in the _OSC. Add support for those designs as well by hardcoding the list of systems. Fixes: 72f2ecb7ece7 ("ACPI: bus: Set CPPC _OSC bits for all and when CPPC_LIB is supported") Fixes: 8b356e536e69f ("ACPI: CPPC: Don't require _OSC if X86_FEATURE_CPPC is supported") Link: https://lore.kernel.org/all/3559249.JlDtxWtqDm@natalenko.name/ Cc: 5.18+ # 5.18+ Reported-and-tested-by: Oleksandr Natalenko Signed-off-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/cppc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 734b96454896..8d8752b44f11 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -16,6 +16,12 @@ bool cpc_supported_by_cpu(void) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: case X86_VENDOR_HYGON: + if (boot_cpu_data.x86 == 0x19 && ((boot_cpu_data.x86_model <= 0x0f) || + (boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f))) + return true; + else if (boot_cpu_data.x86 == 0x17 && + boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f) + return true; return boot_cpu_has(X86_FEATURE_CPPC); } return false; From 8312cd3a7b835ae3033a679e5f0014a40e7891c5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 6 Jul 2022 20:59:42 +0800 Subject: [PATCH 096/248] scsi: megaraid: Clear READ queue map's nr_queues The megaraid SCSI driver sets set->nr_maps as 3 if poll_queues is > 0, and blk-mq actually initializes each map's nr_queues as nr_hw_queues. Consequently the driver has to clear READ queue map's nr_queues, otherwise the queue map becomes broken if poll_queues is set as non-zero. Link: https://lore.kernel.org/r/20220706125942.528533-1-ming.lei@redhat.com Fixes: 9e4bec5b2a23 ("scsi: megaraid_sas: mq_poll support") Cc: Kashyap Desai Cc: sumit.saxena@broadcom.com Cc: chandrakanth.patil@broadcom.com Cc: linux-block@vger.kernel.org Cc: Hannes Reinecke Reported-by: Guangwu Zhang Tested-by: Guangwu Zhang Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index c95360a3c186..0917b05059b4 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -3195,6 +3195,9 @@ static int megasas_map_queues(struct Scsi_Host *shost) qoff += map->nr_queues; offset += map->nr_queues; + /* we never use READ queue, so can't cheat blk-mq */ + shost->tag_set.map[HCTX_TYPE_READ].nr_queues = 0; + /* Setup Poll hctx */ map = &shost->tag_set.map[HCTX_TYPE_POLL]; map->nr_queues = instance->iopoll_q_count; From 2ae57c995003a7840cb6b5ec5f0c06193695321b Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 8 Jul 2022 17:00:27 -0700 Subject: [PATCH 097/248] scsi: ufs: core: Drop loglevel of WriteBoost message Commit '3b5f3c0d0548 ("scsi: ufs: core: Tidy up WB configuration code")' changed the log level of the write boost enable/disable notification from debug to info. This results in a lot of noise in the kernel log during normal operation. Drop it back to debug level to avoid this. Link: https://lore.kernel.org/r/20220709000027.3929970-1-bjorn.andersson@linaro.org Fixes: 3b5f3c0d0548 ("scsi: ufs: core: Tidy up WB configuration code") Reviewed-by: Alim Akhtar Acked-by: Bean Huo Signed-off-by: Bjorn Andersson Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index ce86d1b790c0..a11570b165a9 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -5738,7 +5738,7 @@ int ufshcd_wb_toggle(struct ufs_hba *hba, bool enable) } hba->dev_info.wb_enabled = enable; - dev_info(hba->dev, "%s Write Booster %s\n", + dev_dbg(hba->dev, "%s Write Booster %s\n", __func__, enable ? "enabled" : "disabled"); return ret; From 52a518019ca187227b786f8b8ee20869a97f3af4 Mon Sep 17 00:00:00 2001 From: Po-Wen Kao Date: Mon, 11 Jul 2022 22:42:23 +0800 Subject: [PATCH 098/248] scsi: ufs: core: Fix missing clk change notification on host reset In ufshcd_host_reset_and_restore(), ufshcd_set_clk_freq() is called to scale clock rate. However, this did not call vops->clk_scale_notify() to inform platform driver of clock change. Call ufshcd_scale_clks() instead so that clock change can be properly handled. Link: https://lore.kernel.org/r/20220711144224.17916-2-powen.kao@mediatek.com Reviewed-by: Bart Van Assche Reviewed-by: Stanley Chu Signed-off-by: Po-Wen Kao Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index a11570b165a9..c7b337480e3e 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -7253,7 +7253,7 @@ static int ufshcd_host_reset_and_restore(struct ufs_hba *hba) hba->silence_err_logs = false; /* scale up clocks to max frequency before full reinitialization */ - ufshcd_set_clk_freq(hba, true); + ufshcd_scale_clks(hba, true); err = ufshcd_hba_enable(hba); From e78276cadb669d3e55cffe66bd166ff3c8572e38 Mon Sep 17 00:00:00 2001 From: Changyuan Lyu Date: Thu, 7 Jul 2022 10:52:10 -0700 Subject: [PATCH 099/248] scsi: pm80xx: Fix 'Unknown' max/min linkrate Currently, the data flow of the max/min linkrate in the driver is * in pm8001_get_lrate_mode(): hardcoded value ==> struct sas_phy * in pm8001_bytes_dmaed(): struct pm8001_phy ==> struct sas_phy * in pm8001_phy_control(): libsas data ==> struct pm8001_phy Since pm8001_bytes_dmaed() follows pm8001_get_lrate_mode(), and the fields in struct pm8001_phy are not initialized, sysfs `/sys/class/sas_phy/phy-*/maximum_linkrate` always shows `Unknown`. To fix the issue, change the dataflow to the following: * in pm8001_phy_init(): initial value ==> struct pm8001_phy * in pm8001_get_lrate_mode(): struct pm8001_phy ==> struct sas_phy * in pm8001_phy_control(): libsas data ==> struct pm8001_phy For negotiated linkrate, the current dataflow is: * in pm8001_get_lrate_mode(): iomb data ==> struct asd_sas_phy ==> struct sas_phy * in pm8001_bytes_dmaed(): struct asd_sas_phy ==> struct sas_phy Since pm8001_bytes_dmaed() follows pm8001_get_lrate_mode(), the assignment statements in pm8001_bytes_dmaed() are unnecessary and cleaned up. Link: https://lore.kernel.org/r/20220707175210.528858-1-changyuanl@google.com Reviewed-by: Igor Pylypiv Acked-by: Jack Wang Signed-off-by: Changyuan Lyu Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm8001_hwi.c | 19 +++---------------- drivers/scsi/pm8001/pm8001_init.c | 2 ++ 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index f7466a895d3b..991eb01bb1e0 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -3145,15 +3145,6 @@ void pm8001_bytes_dmaed(struct pm8001_hba_info *pm8001_ha, int i) if (!phy->phy_attached) return; - if (sas_phy->phy) { - struct sas_phy *sphy = sas_phy->phy; - sphy->negotiated_linkrate = sas_phy->linkrate; - sphy->minimum_linkrate = phy->minimum_linkrate; - sphy->minimum_linkrate_hw = SAS_LINK_RATE_1_5_GBPS; - sphy->maximum_linkrate = phy->maximum_linkrate; - sphy->maximum_linkrate_hw = phy->maximum_linkrate; - } - if (phy->phy_type & PORT_TYPE_SAS) { struct sas_identify_frame *id; id = (struct sas_identify_frame *)phy->frame_rcvd; @@ -3177,26 +3168,22 @@ void pm8001_get_lrate_mode(struct pm8001_phy *phy, u8 link_rate) switch (link_rate) { case PHY_SPEED_120: phy->sas_phy.linkrate = SAS_LINK_RATE_12_0_GBPS; - phy->sas_phy.phy->negotiated_linkrate = SAS_LINK_RATE_12_0_GBPS; break; case PHY_SPEED_60: phy->sas_phy.linkrate = SAS_LINK_RATE_6_0_GBPS; - phy->sas_phy.phy->negotiated_linkrate = SAS_LINK_RATE_6_0_GBPS; break; case PHY_SPEED_30: phy->sas_phy.linkrate = SAS_LINK_RATE_3_0_GBPS; - phy->sas_phy.phy->negotiated_linkrate = SAS_LINK_RATE_3_0_GBPS; break; case PHY_SPEED_15: phy->sas_phy.linkrate = SAS_LINK_RATE_1_5_GBPS; - phy->sas_phy.phy->negotiated_linkrate = SAS_LINK_RATE_1_5_GBPS; break; } sas_phy->negotiated_linkrate = phy->sas_phy.linkrate; - sas_phy->maximum_linkrate_hw = SAS_LINK_RATE_6_0_GBPS; + sas_phy->maximum_linkrate_hw = phy->maximum_linkrate; sas_phy->minimum_linkrate_hw = SAS_LINK_RATE_1_5_GBPS; - sas_phy->maximum_linkrate = SAS_LINK_RATE_6_0_GBPS; - sas_phy->minimum_linkrate = SAS_LINK_RATE_1_5_GBPS; + sas_phy->maximum_linkrate = phy->maximum_linkrate; + sas_phy->minimum_linkrate = phy->minimum_linkrate; } /** diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 9b04f1a6a67d..01f2f41928eb 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -143,6 +143,8 @@ static void pm8001_phy_init(struct pm8001_hba_info *pm8001_ha, int phy_id) struct asd_sas_phy *sas_phy = &phy->sas_phy; phy->phy_state = PHY_LINK_DISABLE; phy->pm8001_ha = pm8001_ha; + phy->minimum_linkrate = SAS_LINK_RATE_1_5_GBPS; + phy->maximum_linkrate = SAS_LINK_RATE_6_0_GBPS; sas_phy->enabled = (phy_id < pm8001_ha->chip->n_phy) ? 1 : 0; sas_phy->class = SAS; sas_phy->iproto = SAS_PROTOCOL_ALL; From 355bf2e036c954317ddc4a9618b4f7e38ea5a970 Mon Sep 17 00:00:00 2001 From: Changyuan Lyu Date: Fri, 8 Jul 2022 13:50:26 -0700 Subject: [PATCH 100/248] scsi: pm80xx: Set stopped phy's linkrate to Disabled Negotiated link rate needs to be updated to 'Disabled' when phy is stopped. Link: https://lore.kernel.org/r/20220708205026.969161-1-changyuanl@google.com Reviewed-by: Igor Pylypiv Signed-off-by: Changyuan Lyu Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm80xx_hwi.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index 01c5e8ff4cc5..303cd05fec50 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -3723,8 +3723,12 @@ static int mpi_phy_stop_resp(struct pm8001_hba_info *pm8001_ha, void *piomb) pm8001_dbg(pm8001_ha, MSG, "phy:0x%x status:0x%x\n", phyid, status); if (status == PHY_STOP_SUCCESS || - status == PHY_STOP_ERR_DEVICE_ATTACHED) + status == PHY_STOP_ERR_DEVICE_ATTACHED) { phy->phy_state = PHY_LINK_DISABLE; + phy->sas_phy.phy->negotiated_linkrate = SAS_PHY_DISABLED; + phy->sas_phy.linkrate = SAS_PHY_DISABLED; + } + return 0; } From 33a8573bdfeec5b746aedeea880733a4c7993158 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 14 Jul 2022 15:29:39 +0800 Subject: [PATCH 101/248] x86/bugs: Mark retbleed_strings static This symbol is not used outside of bugs.c, so mark it static. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220714072939.71162-1-jiapeng.chong@linux.alibaba.com --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0dd04713434b..3a0787a36910 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -793,7 +793,7 @@ enum retbleed_mitigation_cmd { RETBLEED_CMD_IBPB, }; -const char * const retbleed_strings[] = { +static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_NONE] = "Vulnerable", [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", From d16e0b26672066035439b2f49887f6576c4a3689 Mon Sep 17 00:00:00 2001 From: Alexandre Chartre Date: Wed, 13 Jul 2022 21:58:08 +0200 Subject: [PATCH 102/248] x86/entry: Remove UNTRAIN_RET from native_irq_return_ldt UNTRAIN_RET is not needed in native_irq_return_ldt because RET untraining has already been done at this point. In addition, when the RETBleed mitigation is IBPB, UNTRAIN_RET clobbers several registers (AX, CX, DX) so here it trashes user values which are in these registers. Signed-off-by: Alexandre Chartre Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/35b0d50f-12d1-10c3-f5e8-d6c140486d4a@oracle.com --- arch/x86/entry/entry_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 285e043a3e40..9953d966d124 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -727,7 +727,6 @@ native_irq_return_ldt: pushq %rdi /* Stash user RDI */ swapgs /* to kernel GS */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ - UNTRAIN_RET movq PER_CPU_VAR(espfix_waddr), %rdi movq %rax, (0*8)(%rdi) /* user RAX */ From fac47b43c760ea90e64b895dba60df0327be7775 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 11 Jul 2022 12:11:21 +0800 Subject: [PATCH 103/248] netfs: do not unlock and put the folio twice check_write_begin() will unlock and put the folio when return non-zero. So we should avoid unlocking and putting it twice in netfs layer. Change the way ->check_write_begin() works in the following two ways: (1) Pass it a pointer to the folio pointer, allowing it to unlock and put the folio prior to doing the stuff it wants to do, provided it clears the folio pointer. (2) Change the return values such that 0 with folio pointer set means continue, 0 with folio pointer cleared means re-get and all error codes indicating an error (no special treatment for -EAGAIN). [ bagasdotme: use Sphinx code text syntax for *foliop pointer ] Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/56423 Link: https://lore.kernel.org/r/cf169f43-8ee7-8697-25da-0204d1b4343e@redhat.com Co-developed-by: David Howells Signed-off-by: Xiubo Li Signed-off-by: David Howells Signed-off-by: Bagas Sanjaya Signed-off-by: Ilya Dryomov --- Documentation/filesystems/netfs_library.rst | 8 +++++--- fs/afs/file.c | 2 +- fs/ceph/addr.c | 11 ++++++----- fs/netfs/buffered_read.c | 17 ++++++++++------- include/linux/netfs.h | 2 +- 5 files changed, 23 insertions(+), 17 deletions(-) diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 4d19b19bcc08..73a4176144b3 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -301,7 +301,7 @@ through which it can issue requests and negotiate:: void (*issue_read)(struct netfs_io_subrequest *subreq); bool (*is_still_valid)(struct netfs_io_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, - struct folio *folio, void **_fsdata); + struct folio **foliop, void **_fsdata); void (*done)(struct netfs_io_request *rreq); }; @@ -381,8 +381,10 @@ The operations are as follows: allocated/grabbed the folio to be modified to allow the filesystem to flush conflicting state before allowing it to be modified. - It should return 0 if everything is now fine, -EAGAIN if the folio should be - regrabbed and any other error code to abort the operation. + It may unlock and discard the folio it was given and set the caller's folio + pointer to NULL. It should return 0 if everything is now fine (``*foliop`` + left set) or the op should be retried (``*foliop`` cleared) and any other + error code to abort the operation. * ``done`` diff --git a/fs/afs/file.c b/fs/afs/file.c index 42118a4f3383..d1cfb235c4b9 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -375,7 +375,7 @@ static int afs_begin_cache_operation(struct netfs_io_request *rreq) } static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, - struct folio *folio, void **_fsdata) + struct folio **foliop, void **_fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6dee88815491..d6e5916138e4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -63,7 +63,7 @@ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata); + struct folio **foliop, void **_fsdata); static inline struct ceph_snap_context *page_snap_context(struct page *page) { @@ -1288,18 +1288,19 @@ ceph_find_incompatible(struct page *page) } static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata) + struct folio **foliop, void **_fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - snapc = ceph_find_incompatible(folio_page(folio, 0)); + snapc = ceph_find_incompatible(folio_page(*foliop, 0)); if (snapc) { int r; - folio_unlock(folio); - folio_put(folio); + folio_unlock(*foliop); + folio_put(*foliop); + *foliop = NULL; if (IS_ERR(snapc)) return PTR_ERR(snapc); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 42f892c5712e..0ce535852151 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -319,8 +319,9 @@ zero_out: * conflicting writes once the folio is grabbed and locked. It is passed a * pointer to the fsdata cookie that gets returned to the VM to be passed to * write_end. It is permitted to sleep. It should return 0 if the request - * should go ahead; unlock the folio and return -EAGAIN to cause the folio to - * be regot; or return an error. + * should go ahead or it may return an error. It may also unlock and put the + * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0 + * will cause the folio to be re-got and the process to be retried. * * The calling netfs must initialise a netfs context contiguous to the vfs * inode before calling this. @@ -348,13 +349,13 @@ retry: if (ctx->ops->check_write_begin) { /* Allow the netfs (eg. ceph) to flush conflicts. */ - ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata); + ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata); if (ret < 0) { trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); - if (ret == -EAGAIN) - goto retry; goto error; } + if (!folio) + goto retry; } if (folio_test_uptodate(folio)) @@ -416,8 +417,10 @@ have_folio_no_wait: error_put: netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); error: - folio_unlock(folio); - folio_put(folio); + if (folio) { + folio_unlock(folio); + folio_put(folio); + } _leave(" = %d", ret); return ret; } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 1773e5df8e65..1b18dfa52e48 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -214,7 +214,7 @@ struct netfs_request_ops { void (*issue_read)(struct netfs_io_subrequest *subreq); bool (*is_still_valid)(struct netfs_io_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, - struct folio *folio, void **_fsdata); + struct folio **foliop, void **_fsdata); void (*done)(struct netfs_io_request *rreq); }; From 564d998106397394b6aad260f219b882b3347e62 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jul 2022 12:20:19 +0200 Subject: [PATCH 104/248] um: Add missing apply_returns() Implement apply_returns() stub for UM, just like all the other patching routines. Fixes: 15e67227c49a ("x86: Undo return-thunk damage") Reported-by: Randy Dunlap Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/Ys%2Ft45l%2FgarIrD0u@worktop.programming.kicks-ass.net --- arch/um/kernel/um_arch.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 0760e24f2eba..9838967d0b2f 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -432,6 +432,10 @@ void apply_retpolines(s32 *start, s32 *end) { } +void apply_returns(s32 *start, s32 *end) +{ +} + void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } From 99482726452bdf8be9325199022b17fa6d7d58fe Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 12 Jul 2022 15:50:09 +0200 Subject: [PATCH 105/248] KVM: nVMX: Always enable TSC scaling for L2 when it was enabled for L1 Windows 10/11 guests with Hyper-V role (WSL2) enabled are observed to hang upon boot or shortly after when a non-default TSC frequency was set for L1. The issue is observed on a host where TSC scaling is supported. The problem appears to be that Windows doesn't use TSC frequency for its guests even when the feature is advertised and KVM filters SECONDARY_EXEC_TSC_SCALING out when creating L2 controls from L1's. This leads to L2 running with the default frequency (matching host's) while L1 is running with an altered one. Keep SECONDARY_EXEC_TSC_SCALING in secondary exec controls for L2 when it was set for L1. TSC_MULTIPLIER is already correctly computed and written by prepare_vmcs02(). Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Message-Id: <20220712135009.952805-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f5cb18e00e78..67d629837672 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2278,7 +2278,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_DESC); if (nested_cpu_has(vmcs12, From 84e7051c0bc1f2a13101553959b3a9d9a8e24939 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 13 Jul 2022 14:12:41 -0300 Subject: [PATCH 106/248] x86/kvm: fix FASTOP_SIZE when return thunks are enabled The return thunk call makes the fastop functions larger, just like IBT does. Consider a 16-byte FASTOP_SIZE when CONFIG_RETHUNK is enabled. Otherwise, functions will be incorrectly aligned and when computing their position for differently sized operators, they will executed in the middle or end of a function, which may as well be an int3, leading to a crash like: [ 36.091116] int3: 0000 [#1] SMP NOPTI [ 36.091119] CPU: 3 PID: 1371 Comm: qemu-system-x86 Not tainted 5.15.0-41-generic #44 [ 36.091120] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 [ 36.091121] RIP: 0010:xaddw_ax_dx+0x9/0x10 [kvm] [ 36.091185] Code: 00 0f bb d0 c3 cc cc cc cc 48 0f bb d0 c3 cc cc cc cc 0f 1f 80 00 00 00 00 0f c0 d0 c3 cc cc cc cc 66 0f c1 d0 c3 cc cc cc cc <0f> 1f 80 00 00 00 00 0f c1 d0 c3 cc cc cc cc 48 0f c1 d0 c3 cc cc [ 36.091186] RSP: 0018:ffffb1f541143c98 EFLAGS: 00000202 [ 36.091188] RAX: 0000000089abcdef RBX: 0000000000000001 RCX: 0000000000000000 [ 36.091188] RDX: 0000000076543210 RSI: ffffffffc073c6d0 RDI: 0000000000000200 [ 36.091189] RBP: ffffb1f541143ca0 R08: ffff9f1803350a70 R09: 0000000000000002 [ 36.091190] R10: ffff9f1803350a70 R11: 0000000000000000 R12: ffff9f1803350a70 [ 36.091190] R13: ffffffffc077fee0 R14: 0000000000000000 R15: 0000000000000000 [ 36.091191] FS: 00007efdfce8d640(0000) GS:ffff9f187dd80000(0000) knlGS:0000000000000000 [ 36.091192] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 36.091192] CR2: 0000000000000000 CR3: 0000000009b62002 CR4: 0000000000772ee0 [ 36.091195] PKRU: 55555554 [ 36.091195] Call Trace: [ 36.091197] [ 36.091198] ? fastop+0x5a/0xa0 [kvm] [ 36.091222] x86_emulate_insn+0x7b8/0xe90 [kvm] [ 36.091244] x86_emulate_instruction+0x2f4/0x630 [kvm] [ 36.091263] ? kvm_arch_vcpu_load+0x7c/0x230 [kvm] [ 36.091283] ? vmx_prepare_switch_to_host+0xf7/0x190 [kvm_intel] [ 36.091290] complete_emulated_mmio+0x297/0x320 [kvm] [ 36.091310] kvm_arch_vcpu_ioctl_run+0x32f/0x550 [kvm] [ 36.091330] kvm_vcpu_ioctl+0x29e/0x6d0 [kvm] [ 36.091344] ? kvm_vcpu_ioctl+0x120/0x6d0 [kvm] [ 36.091357] ? __fget_files+0x86/0xc0 [ 36.091362] ? __fget_files+0x86/0xc0 [ 36.091363] __x64_sys_ioctl+0x92/0xd0 [ 36.091366] do_syscall_64+0x59/0xc0 [ 36.091369] ? syscall_exit_to_user_mode+0x27/0x50 [ 36.091370] ? do_syscall_64+0x69/0xc0 [ 36.091371] ? syscall_exit_to_user_mode+0x27/0x50 [ 36.091372] ? __x64_sys_writev+0x1c/0x30 [ 36.091374] ? do_syscall_64+0x69/0xc0 [ 36.091374] ? exit_to_user_mode_prepare+0x37/0xb0 [ 36.091378] ? syscall_exit_to_user_mode+0x27/0x50 [ 36.091379] ? do_syscall_64+0x69/0xc0 [ 36.091379] ? do_syscall_64+0x69/0xc0 [ 36.091380] ? do_syscall_64+0x69/0xc0 [ 36.091381] ? do_syscall_64+0x69/0xc0 [ 36.091381] entry_SYSCALL_64_after_hwframe+0x61/0xcb [ 36.091384] RIP: 0033:0x7efdfe6d1aff [ 36.091390] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <41> 89 c0 3d 00 f0 ff ff 77 1f 48 8b 44 24 18 64 48 2b 04 25 28 00 [ 36.091391] RSP: 002b:00007efdfce8c460 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 36.091393] RAX: ffffffffffffffda RBX: 000000000000ae80 RCX: 00007efdfe6d1aff [ 36.091393] RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 000000000000000c [ 36.091394] RBP: 0000558f1609e220 R08: 0000558f13fb8190 R09: 00000000ffffffff [ 36.091394] R10: 0000558f16b5e950 R11: 0000000000000246 R12: 0000000000000000 [ 36.091394] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000000 [ 36.091396] [ 36.091397] Modules linked in: isofs nls_iso8859_1 kvm_intel joydev kvm input_leds serio_raw sch_fq_codel dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua ipmi_devintf ipmi_msghandler drm msr ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel virtio_net net_failover crypto_simd ahci xhci_pci cryptd psmouse virtio_blk libahci xhci_pci_renesas failover [ 36.123271] ---[ end trace db3c0ab5a48fabcc ]--- [ 36.123272] RIP: 0010:xaddw_ax_dx+0x9/0x10 [kvm] [ 36.123319] Code: 00 0f bb d0 c3 cc cc cc cc 48 0f bb d0 c3 cc cc cc cc 0f 1f 80 00 00 00 00 0f c0 d0 c3 cc cc cc cc 66 0f c1 d0 c3 cc cc cc cc <0f> 1f 80 00 00 00 00 0f c1 d0 c3 cc cc cc cc 48 0f c1 d0 c3 cc cc [ 36.123320] RSP: 0018:ffffb1f541143c98 EFLAGS: 00000202 [ 36.123321] RAX: 0000000089abcdef RBX: 0000000000000001 RCX: 0000000000000000 [ 36.123321] RDX: 0000000076543210 RSI: ffffffffc073c6d0 RDI: 0000000000000200 [ 36.123322] RBP: ffffb1f541143ca0 R08: ffff9f1803350a70 R09: 0000000000000002 [ 36.123322] R10: ffff9f1803350a70 R11: 0000000000000000 R12: ffff9f1803350a70 [ 36.123323] R13: ffffffffc077fee0 R14: 0000000000000000 R15: 0000000000000000 [ 36.123323] FS: 00007efdfce8d640(0000) GS:ffff9f187dd80000(0000) knlGS:0000000000000000 [ 36.123324] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 36.123325] CR2: 0000000000000000 CR3: 0000000009b62002 CR4: 0000000000772ee0 [ 36.123327] PKRU: 55555554 [ 36.123328] Kernel panic - not syncing: Fatal exception in interrupt [ 36.123410] Kernel Offset: 0x1400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 36.135305] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: aa3d480315ba ("x86: Use return-thunk in asm code") Signed-off-by: Thadeu Lima de Souza Cascardo Co-developed-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Paolo Bonzini Reported-by: Linux Kernel Functional Testing Message-Id: <20220713171241.184026-1-cascardo@canonical.com> Tested-by: Jack Wang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index db96bf7d1122..0a15b0fec6d9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -189,8 +189,12 @@ #define X8(x...) X4(x), X4(x) #define X16(x...) X8(x), X8(x) -#define NR_FASTOP (ilog2(sizeof(ulong)) + 1) -#define FASTOP_SIZE (8 * (1 + HAS_KERNEL_IBT)) +#define NR_FASTOP (ilog2(sizeof(ulong)) + 1) +#define RET_LENGTH (1 + (4 * IS_ENABLED(CONFIG_RETHUNK)) + \ + IS_ENABLED(CONFIG_SLS)) +#define FASTOP_LENGTH (ENDBR_INSN_SIZE + 7 + RET_LENGTH) +#define FASTOP_SIZE (8 << ((FASTOP_LENGTH > 8) & 1) << ((FASTOP_LENGTH > 16) & 1)) +static_assert(FASTOP_LENGTH <= FASTOP_SIZE); struct opcode { u64 flags; @@ -442,8 +446,6 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); * RET | JMP __x86_return_thunk [1,5 bytes; CONFIG_RETHUNK] * INT3 [1 byte; CONFIG_SLS] */ -#define RET_LENGTH (1 + (4 * IS_ENABLED(CONFIG_RETHUNK)) + \ - IS_ENABLED(CONFIG_SLS)) #define SETCC_LENGTH (ENDBR_INSN_SIZE + 3 + RET_LENGTH) #define SETCC_ALIGN (4 << ((SETCC_LENGTH > 4) & 1) << ((SETCC_LENGTH > 8) & 1)) static_assert(SETCC_LENGTH <= SETCC_ALIGN); From 1b870fa5573e260bc74d19f381ab0dd971a8d8e7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 14 Jul 2022 07:27:31 -0400 Subject: [PATCH 107/248] kvm: stats: tell userspace which values are boolean Some of the statistics values exported by KVM are always only 0 or 1. It can be useful to export this fact to userspace so that it can track them specially (for example by polling the value every now and then to compute a % of time spent in a specific state). Therefore, add "boolean value" as a new "unit". While it is not exactly a unit, it walks and quacks like one. In particular, using the type would be wrong because boolean values could be instantaneous or peak values (e.g. "is the rmap allocated?") or even two-bucket histograms (e.g. "number of posted vs. non-posted interrupt injections"). Suggested-by: Amneesh Singh Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 6 ++++++ arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 11 ++++++++++- include/uapi/linux/kvm.h | 1 + 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 11e00a46c610..48bf6e49a7de 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5657,6 +5657,7 @@ by a string of size ``name_size``. #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) + #define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES #define KVM_STATS_BASE_SHIFT 8 @@ -5724,6 +5725,11 @@ Bits 4-7 of ``flags`` encode the unit: It indicates that the statistics data is used to measure time or latency. * ``KVM_STATS_UNIT_CYCLES`` It indicates that the statistics data is used to measure CPU clock cycles. + * ``KVM_STATS_UNIT_BOOLEAN`` + It indicates that the statistic will always be either 0 or 1. Boolean + statistics of "peak" type will never go back from 1 to 0. Boolean + statistics can be linear histograms (with two buckets) but not logarithmic + histograms. Bits 8-11 of ``flags``, together with ``exponent``, encode the scale of the unit: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 26d0cac32f73..0c3e85e8fce9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -298,7 +298,7 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_COUNTER(VCPU, preemption_reported), STATS_DESC_COUNTER(VCPU, preemption_other), - STATS_DESC_ICOUNTER(VCPU, guest_mode) + STATS_DESC_IBOOLEAN(VCPU, guest_mode) }; const struct kvm_stats_header kvm_vcpu_stats_header = { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 83cf7fd842e0..90a45ef7203b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1822,6 +1822,15 @@ struct _kvm_stats_desc { STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) +/* Instantaneous boolean value, read only */ +#define STATS_DESC_IBOOLEAN(SCOPE, name) \ + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ + KVM_STATS_BASE_POW10, 0) +/* Peak (sticky) boolean value, read/write */ +#define STATS_DESC_PBOOLEAN(SCOPE, name) \ + STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ + KVM_STATS_BASE_POW10, 0) + /* Cumulative time in nanosecond */ #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ @@ -1853,7 +1862,7 @@ struct _kvm_stats_desc { HALT_POLL_HIST_COUNT), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ HALT_POLL_HIST_COUNT), \ - STATS_DESC_ICOUNTER(VCPU_GENERIC, blocking) + STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking) extern struct dentry *kvm_debugfs_dir; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5088bd9f1922..811897dadcae 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -2083,6 +2083,7 @@ struct kvm_stats_header { #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES #define KVM_STATS_BASE_SHIFT 8 From 942d9e89524c135615e557fffa144104ea8fb361 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 14 Jul 2022 07:29:57 -0400 Subject: [PATCH 108/248] Documentation: kvm: clarify histogram units In the case of histogram statistics, the values are always sample counts; the unit instead applies to the bucket range. For example, halt_poll_success_hist is a nanosecond statistic because the buckets are for 0ns, 1ns, 2-3ns, 4-7ns etc. There isn't really any other sensible interpretation, but clarify this anyway in the Documentation. Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 48bf6e49a7de..6e090fb96a0e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5703,14 +5703,13 @@ Bits 0-3 of ``flags`` encode the type: by the ``hist_param`` field. The range of the Nth bucket (1 <= N < ``size``) is [``hist_param``*(N-1), ``hist_param``*N), while the range of the last bucket is [``hist_param``*(``size``-1), +INF). (+INF means positive infinity - value.) The bucket value indicates how many samples fell in the bucket's range. + value.) * ``KVM_STATS_TYPE_LOG_HIST`` The statistic is reported as a logarithmic histogram. The number of buckets is specified by the ``size`` field. The range of the first bucket is [0, 1), while the range of the last bucket is [pow(2, ``size``-2), +INF). Otherwise, The Nth bucket (1 < N < ``size``) covers - [pow(2, N-2), pow(2, N-1)). The bucket value indicates how many samples fell - in the bucket's range. + [pow(2, N-2), pow(2, N-1)). Bits 4-7 of ``flags`` encode the unit: @@ -5731,6 +5730,10 @@ Bits 4-7 of ``flags`` encode the unit: statistics can be linear histograms (with two buckets) but not logarithmic histograms. +Note that, in the case of histograms, the unit applies to the bucket +ranges, while the bucket value indicates how many samples fell in the +bucket's range. + Bits 8-11 of ``flags``, together with ``exponent``, encode the scale of the unit: @@ -5752,7 +5755,7 @@ the corresponding statistics data. The ``bucket_size`` field is used as a parameter for histogram statistics data. It is only used by linear histogram statistics data, specifying the size of a -bucket. +bucket in the unit expressed by bits 4-11 of ``flags`` together with ``exponent``. The ``name`` field is the name string of the statistics data. The name string starts at the end of ``struct kvm_stats_desc``. The maximum length including From 73d5fe046270281a46344e06bf986c607632f7ea Mon Sep 17 00:00:00 2001 From: Vaishnav Achath Date: Wed, 1 Jun 2022 12:46:11 +0530 Subject: [PATCH 109/248] spi: cadence-quadspi: Remove spi_master_put() in probe failure path Currently the spi_master is allocated by devm_spi_alloc_master() and devres core manages the deallocation, but in probe failure path spi_master_put() is being handled manually which causes "refcount underflow use-after-free" warning when probe failure happens after allocating spi_master. Trimmed backtrace during failure: refcount_t: underflow; use-after-free. pc : refcount_warn_saturate+0xf4/0x144 Call trace: refcount_warn_saturate kobject_put put_device devm_spi_release_controller devres_release_all This commit makes relevant changes to remove spi_master_put() from probe failure path. Fixes: 606e5d408184 ("spi: cadence-quadspi: Handle spi_unregister_master() in remove()") Signed-off-by: Vaishnav Achath Link: https://lore.kernel.org/r/20220601071611.11853-1-vaishnav.a@ti.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 2b9fc8449a62..72b1a5a2298c 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -1578,8 +1578,7 @@ static int cqspi_probe(struct platform_device *pdev) ret = cqspi_of_get_pdata(cqspi); if (ret) { dev_err(dev, "Cannot get mandatory OF data.\n"); - ret = -ENODEV; - goto probe_master_put; + return -ENODEV; } /* Obtain QSPI clock. */ @@ -1587,7 +1586,7 @@ static int cqspi_probe(struct platform_device *pdev) if (IS_ERR(cqspi->clk)) { dev_err(dev, "Cannot claim QSPI clock.\n"); ret = PTR_ERR(cqspi->clk); - goto probe_master_put; + return ret; } /* Obtain and remap controller address. */ @@ -1596,7 +1595,7 @@ static int cqspi_probe(struct platform_device *pdev) if (IS_ERR(cqspi->iobase)) { dev_err(dev, "Cannot remap controller address.\n"); ret = PTR_ERR(cqspi->iobase); - goto probe_master_put; + return ret; } /* Obtain and remap AHB address. */ @@ -1605,7 +1604,7 @@ static int cqspi_probe(struct platform_device *pdev) if (IS_ERR(cqspi->ahb_base)) { dev_err(dev, "Cannot remap AHB address.\n"); ret = PTR_ERR(cqspi->ahb_base); - goto probe_master_put; + return ret; } cqspi->mmap_phys_base = (dma_addr_t)res_ahb->start; cqspi->ahb_size = resource_size(res_ahb); @@ -1614,15 +1613,13 @@ static int cqspi_probe(struct platform_device *pdev) /* Obtain IRQ line. */ irq = platform_get_irq(pdev, 0); - if (irq < 0) { - ret = -ENXIO; - goto probe_master_put; - } + if (irq < 0) + return -ENXIO; pm_runtime_enable(dev); ret = pm_runtime_resume_and_get(dev); if (ret < 0) - goto probe_master_put; + return ret; ret = clk_prepare_enable(cqspi->clk); if (ret) { @@ -1716,8 +1713,6 @@ probe_reset_failed: probe_clk_failed: pm_runtime_put_sync(dev); pm_runtime_disable(dev); -probe_master_put: - spi_master_put(master); return ret; } From 081f5e753c9c4cd1dd86000bcc7f5fe14cbdcab0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 Jul 2022 08:16:32 -0700 Subject: [PATCH 110/248] nvme-pci: fix freeze accounting for error handling A reset on a live device experiencing a link error still needs to have the queue freeze state started for the subsequent reinitialization. Skip only the register read if the device is not present instead of bypassing the freeze checks. Fixes: b98235d3a471e ("nvme-pci: harden drive presence detect in nvme_dev_disable()") Reported-by: Niklas Schnelle Signed-off-by: Keith Busch Tested-by: Niklas Schnelle Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 193b44755662..58c72d55769a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2690,8 +2690,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) struct pci_dev *pdev = to_pci_dev(dev->dev); mutex_lock(&dev->shutdown_lock); - if (pci_device_is_present(pdev) && pci_is_enabled(pdev)) { - u32 csts = readl(dev->bar + NVME_REG_CSTS); + if (pci_is_enabled(pdev)) { + u32 csts; + + if (pci_device_is_present(pdev)) + csts = readl(dev->bar + NVME_REG_CSTS); + else + csts = ~0; if (dev->ctrl.state == NVME_CTRL_LIVE || dev->ctrl.state == NVME_CTRL_RESETTING) { From 6961b5e02876b3b47f030a1f1ee8fd3e631ac270 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Thu, 14 Jul 2022 12:42:10 +0000 Subject: [PATCH 111/248] nvme: fix block device naming collision The issue exists when multipath is enabled and the namespace is shared, but all the other controller checks at nvme_is_unique_nsid() are false. The reason for this issue is that nvme_is_unique_nsid() returns false when is called from nvme_mpath_alloc_disk() due to an uninitialized value of head->shared. The patch fixes it by setting head->shared before nvme_mpath_alloc_disk() is called. Fixes: 5974ea7ce0f9 ("nvme: allow duplicate NSIDs for private namespaces") Signed-off-by: Israel Rukshin Reviewed-by: Keith Busch Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ec6ac298d8de..6a12a906a11e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3786,7 +3786,7 @@ static int nvme_add_ns_cdev(struct nvme_ns *ns) } static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, - unsigned nsid, struct nvme_ns_ids *ids) + unsigned nsid, struct nvme_ns_ids *ids, bool is_shared) { struct nvme_ns_head *head; size_t size = sizeof(*head); @@ -3810,6 +3810,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head->subsys = ctrl->subsys; head->ns_id = nsid; head->ids = *ids; + head->shared = is_shared; kref_init(&head->ref); if (head->ids.csi) { @@ -3891,12 +3892,11 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, nsid); goto out_unlock; } - head = nvme_alloc_ns_head(ctrl, nsid, ids); + head = nvme_alloc_ns_head(ctrl, nsid, ids, is_shared); if (IS_ERR(head)) { ret = PTR_ERR(head); goto out_unlock; } - head->shared = is_shared; } else { ret = -EINVAL; if (!is_shared || !head->shared) { From 8a414f943f8b5f94bbaafdec863d6f3dbef33f8a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 8 Jul 2022 14:51:47 +0200 Subject: [PATCH 112/248] KVM: x86: Fully initialize 'struct kvm_lapic_irq' in kvm_pv_kick_cpu_op() 'vector' and 'trig_mode' fields of 'struct kvm_lapic_irq' are left uninitialized in kvm_pv_kick_cpu_op(). While these fields are normally not needed for APIC_DM_REMRD, they're still referenced by __apic_accept_irq() for trace_kvm_apic_accept_irq(). Fully initialize the structure to avoid consuming random stack memory. Fixes: a183b638b61c ("KVM: x86: make apic_accept_irq tracepoint more generic") Reported-by: syzbot+d6caa905917d353f0d07@syzkaller.appspotmail.com Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Message-Id: <20220708125147.593975-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c3e85e8fce9..143e37298d8a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9143,15 +9143,17 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, */ static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid) { - struct kvm_lapic_irq lapic_irq; + /* + * All other fields are unused for APIC_DM_REMRD, but may be consumed by + * common code, e.g. for tracing. Defer initialization to the compiler. + */ + struct kvm_lapic_irq lapic_irq = { + .delivery_mode = APIC_DM_REMRD, + .dest_mode = APIC_DEST_PHYSICAL, + .shorthand = APIC_DEST_NOSHORT, + .dest_id = apicid, + }; - lapic_irq.shorthand = APIC_DEST_NOSHORT; - lapic_irq.dest_mode = APIC_DEST_PHYSICAL; - lapic_irq.level = 0; - lapic_irq.dest_id = apicid; - lapic_irq.msi_redir_hint = false; - - lapic_irq.delivery_mode = APIC_DM_REMRD; kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } From b49feacbeffc7635cc6692cbcc6a1eae2c17da6f Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Sun, 8 May 2022 10:09:05 +0300 Subject: [PATCH 113/248] e1000e: Enable GPT clock before sending message to CSME On corporate (CSME) ADL systems, the Ethernet Controller may stop working ("HW unit hang") after exiting from the s0ix state. The reason is that CSME misses the message sent by the host. Enabling the dynamic GPT clock solves this problem. This clock is cleared upon HW initialization. Fixes: 3e55d231716e ("e1000e: Add handshake with the CSME to support S0ix") Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=214821 Reviewed-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Chia-Lin Kao (AceLan) Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/netdev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index fa06f68c8c80..c64102b29862 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6494,6 +6494,10 @@ static void e1000e_s0ix_exit_flow(struct e1000_adapter *adapter) if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && hw->mac.type >= e1000_pch_adp) { + /* Keep the GPT clock enabled for CSME */ + mac_data = er32(FEXTNVM); + mac_data |= BIT(3); + ew32(FEXTNVM, mac_data); /* Request ME unconfigure the device from S0ix */ mac_data = er32(H2ME); mac_data &= ~E1000_H2ME_START_DPG; From 6cfa45361d3eac31ba67d7d0bbef547151450106 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Mon, 9 May 2022 11:52:54 +0300 Subject: [PATCH 114/248] Revert "e1000e: Fix possible HW unit hang after an s0ix exit" This reverts commit 1866aa0d0d6492bc2f8d22d0df49abaccf50cddd. Commit 1866aa0d0d64 ("e1000e: Fix possible HW unit hang after an s0ix exit") was a workaround for CSME problem to handle messages comes via H2ME mailbox. This problem has been fixed by patch "e1000e: Enable the GPT clock before sending message to the CSME". Fixes: 3e55d231716e ("e1000e: Add handshake with the CSME to support S0ix") Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=214821 Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/hw.h | 1 - drivers/net/ethernet/intel/e1000e/ich8lan.c | 4 ---- drivers/net/ethernet/intel/e1000e/ich8lan.h | 1 - drivers/net/ethernet/intel/e1000e/netdev.c | 26 --------------------- 4 files changed, 32 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/hw.h b/drivers/net/ethernet/intel/e1000e/hw.h index 13382df2f2ef..bcf680e83811 100644 --- a/drivers/net/ethernet/intel/e1000e/hw.h +++ b/drivers/net/ethernet/intel/e1000e/hw.h @@ -630,7 +630,6 @@ struct e1000_phy_info { bool disable_polarity_correction; bool is_mdix; bool polarity_correction; - bool reset_disable; bool speed_downgraded; bool autoneg_wait_to_complete; }; diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index e6c8e6d5234f..9466f65a6da7 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -2050,10 +2050,6 @@ static s32 e1000_check_reset_block_ich8lan(struct e1000_hw *hw) bool blocked = false; int i = 0; - /* Check the PHY (LCD) reset flag */ - if (hw->phy.reset_disable) - return true; - while ((blocked = !(er32(FWSM) & E1000_ICH_FWSM_RSPCIPHY)) && (i++ < 30)) usleep_range(10000, 11000); diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.h b/drivers/net/ethernet/intel/e1000e/ich8lan.h index 638a3ddd7ada..2504b11c3169 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.h +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.h @@ -271,7 +271,6 @@ #define I217_CGFREG_ENABLE_MTA_RESET 0x0002 #define I217_MEMPWR PHY_REG(772, 26) #define I217_MEMPWR_DISABLE_SMB_RELEASE 0x0010 -#define I217_MEMPWR_MOEM 0x1000 /* Receive Address Initial CRC Calculation */ #define E1000_PCH_RAICC(_n) (0x05F50 + ((_n) * 4)) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index c64102b29862..f1729940e46c 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6991,21 +6991,8 @@ static __maybe_unused int e1000e_pm_suspend(struct device *dev) struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); struct pci_dev *pdev = to_pci_dev(dev); - struct e1000_hw *hw = &adapter->hw; - u16 phy_data; int rc; - if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && - hw->mac.type >= e1000_pch_adp) { - /* Mask OEM Bits / Gig Disable / Restart AN (772_26[12] = 1) */ - e1e_rphy(hw, I217_MEMPWR, &phy_data); - phy_data |= I217_MEMPWR_MOEM; - e1e_wphy(hw, I217_MEMPWR, phy_data); - - /* Disable LCD reset */ - hw->phy.reset_disable = true; - } - e1000e_flush_lpic(pdev); e1000e_pm_freeze(dev); @@ -7027,8 +7014,6 @@ static __maybe_unused int e1000e_pm_resume(struct device *dev) struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); struct pci_dev *pdev = to_pci_dev(dev); - struct e1000_hw *hw = &adapter->hw; - u16 phy_data; int rc; /* Introduce S0ix implementation */ @@ -7039,17 +7024,6 @@ static __maybe_unused int e1000e_pm_resume(struct device *dev) if (rc) return rc; - if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && - hw->mac.type >= e1000_pch_adp) { - /* Unmask OEM Bits / Gig Disable / Restart AN 772_26[12] = 0 */ - e1e_rphy(hw, I217_MEMPWR, &phy_data); - phy_data &= ~I217_MEMPWR_MOEM; - e1e_wphy(hw, I217_MEMPWR, phy_data); - - /* Enable LCD reset */ - hw->phy.reset_disable = false; - } - return e1000e_pm_thaw(dev); } From 7c1ddcee5311f3315096217881d2dbe47cc683f9 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Thu, 2 Jun 2022 18:58:11 +0300 Subject: [PATCH 115/248] igc: Reinstate IGC_REMOVED logic and implement it properly The initially merged version of the igc driver code (via commit 146740f9abc4, "igc: Add support for PF") contained the following IGC_REMOVED checks in the igc_rd32/wr32() MMIO accessors: u32 igc_rd32(struct igc_hw *hw, u32 reg) { u8 __iomem *hw_addr = READ_ONCE(hw->hw_addr); u32 value = 0; if (IGC_REMOVED(hw_addr)) return ~value; value = readl(&hw_addr[reg]); /* reads should not return all F's */ if (!(~value) && (!reg || !(~readl(hw_addr)))) hw->hw_addr = NULL; return value; } And: #define wr32(reg, val) \ do { \ u8 __iomem *hw_addr = READ_ONCE((hw)->hw_addr); \ if (!IGC_REMOVED(hw_addr)) \ writel((val), &hw_addr[(reg)]); \ } while (0) E.g. igb has similar checks in its MMIO accessors, and has a similar macro E1000_REMOVED, which is implemented as follows: #define E1000_REMOVED(h) unlikely(!(h)) These checks serve to detect and take note of an 0xffffffff MMIO read return from the device, which can be caused by a PCIe link flap or some other kind of PCI bus error, and to avoid performing MMIO reads and writes from that point onwards. However, the IGC_REMOVED macro was not originally implemented: #ifndef IGC_REMOVED #define IGC_REMOVED(a) (0) #endif /* IGC_REMOVED */ This led to the IGC_REMOVED logic to be removed entirely in a subsequent commit (commit 3c215fb18e70, "igc: remove IGC_REMOVED function"), with the rationale that such checks matter only for virtualization and that igc does not support virtualization -- but a PCIe device can become detached even without virtualization being in use, and without proper checks, a PCIe bus error affecting an igc adapter will lead to various NULL pointer dereferences, as the first access after the error will set hw->hw_addr to NULL, and subsequent accesses will blindly dereference this now-NULL pointer. This patch reinstates the IGC_REMOVED checks in igc_rd32/wr32(), and implements IGC_REMOVED the way it is done for igb, by checking for the unlikely() case of hw_addr being NULL. This change prevents the oopses seen when a PCIe link flap occurs on an igc adapter. Fixes: 146740f9abc4 ("igc: Add support for PF") Signed-off-by: Lennert Buytenhek Tested-by: Naama Meir Acked-by: Sasha Neftin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 3 +++ drivers/net/ethernet/intel/igc/igc_regs.h | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index ae17af44fe02..a5ebee7df4a8 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6171,6 +6171,9 @@ u32 igc_rd32(struct igc_hw *hw, u32 reg) u8 __iomem *hw_addr = READ_ONCE(hw->hw_addr); u32 value = 0; + if (IGC_REMOVED(hw_addr)) + return ~value; + value = readl(&hw_addr[reg]); /* reads should not return all F's */ diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h index e197a33d93a0..026c3b65fc37 100644 --- a/drivers/net/ethernet/intel/igc/igc_regs.h +++ b/drivers/net/ethernet/intel/igc/igc_regs.h @@ -306,7 +306,8 @@ u32 igc_rd32(struct igc_hw *hw, u32 reg); #define wr32(reg, val) \ do { \ u8 __iomem *hw_addr = READ_ONCE((hw)->hw_addr); \ - writel((val), &hw_addr[(reg)]); \ + if (!IGC_REMOVED(hw_addr)) \ + writel((val), &hw_addr[(reg)]); \ } while (0) #define rd32(reg) (igc_rd32(hw, reg)) @@ -318,4 +319,6 @@ do { \ #define array_rd32(reg, offset) (igc_rd32(hw, (reg) + ((offset) << 2))) +#define IGC_REMOVED(h) unlikely(!(h)) + #endif From 957a2b345cbcf41b4b25d471229f0e35262f066c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 13 Jul 2022 22:02:26 +0800 Subject: [PATCH 116/248] block: fix missing blkcg_bio_issue_init The commit 513616843d73 ("block: remove superfluous calls to blkcg_bio_issue_init") has removed blkcg_bio_issue_init from __bio_clone since submit_bio will override ->bi_issue. However, __blk_queue_split is called after blkcg_bio_issue_init (see blk_mq_submit_bio) in submit_bio. In this case, the ->bi_issue is 0. Fix it. Fixes: 513616843d73 ("block: remove superfluous calls to blkcg_bio_issue_init") Signed-off-by: Muchun Song Link: https://lore.kernel.org/r/20220713140226.68135-1-songmuchun@bytedance.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 7771dacc99cb..f5e6527ebc9c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -345,6 +345,7 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, /* there isn't chance to merge the splitted bio */ split->bi_opf |= REQ_NOMERGE; + blkcg_bio_issue_init(split); bio_chain(split, *bio); trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); From 5ad26161a371e4aa2d2553286f0cac580987a493 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 13 Jul 2022 23:11:01 +0200 Subject: [PATCH 117/248] ACPI: video: Fix acpi_video_handles_brightness_key_presses() Commit 3a0cf7ab8df3 ("ACPI: video: Change how we determine if brightness key-presses are handled") made acpi_video_handles_brightness_key_presses() report false when none of the ACPI Video Devices support backlight control. But it turns out that at least on a Dell Inspiron N4010 there is no ACPI backlight control, yet brightness hotkeys are still reported through the ACPI Video Bus; and since acpi_video_handles_brightness_key_presses() now returns false, brightness keypresses are now reported twice. To fix this rename the has_backlight flag to may_report_brightness_keys and also set it the first time a brightness key press event is received. Depending on the delivery of the other ACPI (WMI) event vs the ACPI Video Bus event this means that the first brightness key press might still get reported twice, but all further keypresses will be filtered as before. Note that this relies on other drivers reporting brightness key events calling acpi_video_handles_brightness_key_presses() when delivering the events (rather then once during driver probe). This is already required and documented in include/acpi/video.h: /* * Note: The value returned by acpi_video_handles_brightness_key_presses() * may change over time and should not be cached. */ Fixes: 3a0cf7ab8df3 ("ACPI: video: Change how we determine if brightness key-presses are handled") Link: https://lore.kernel.org/regressions/CALF=6jEe5G8+r1Wo0vvz4GjNQQhdkLT5p8uCHn6ZXhg4nsOWow@mail.gmail.com/ Reported-and-tested-by: Ben Greening Signed-off-by: Hans de Goede Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20220713211101.85547-2-hdegoede@redhat.com --- drivers/acpi/acpi_video.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index 43177c20ce4f..eaea733b368a 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -73,7 +73,7 @@ module_param(device_id_scheme, bool, 0444); static int only_lcd = -1; module_param(only_lcd, int, 0444); -static bool has_backlight; +static bool may_report_brightness_keys; static int register_count; static DEFINE_MUTEX(register_count_mutex); static DEFINE_MUTEX(video_list_lock); @@ -1224,7 +1224,7 @@ acpi_video_bus_get_one_device(struct acpi_device *device, acpi_video_device_find_cap(data); if (data->cap._BCM && data->cap._BCL) - has_backlight = true; + may_report_brightness_keys = true; mutex_lock(&video->device_list_lock); list_add_tail(&data->entry, &video->video_device_list); @@ -1693,6 +1693,9 @@ static void acpi_video_device_notify(acpi_handle handle, u32 event, void *data) break; } + if (keycode) + may_report_brightness_keys = true; + acpi_notifier_call_chain(device, event, 0); if (keycode && (report_key_events & REPORT_BRIGHTNESS_KEY_EVENTS)) { @@ -2253,7 +2256,7 @@ void acpi_video_unregister(void) if (register_count) { acpi_bus_unregister_driver(&acpi_video_bus); register_count = 0; - has_backlight = false; + may_report_brightness_keys = false; } mutex_unlock(®ister_count_mutex); } @@ -2275,7 +2278,7 @@ void acpi_video_unregister_backlight(void) bool acpi_video_handles_brightness_key_presses(void) { - return has_backlight && + return may_report_brightness_keys && (report_key_events & REPORT_BRIGHTNESS_KEY_EVENTS); } EXPORT_SYMBOL(acpi_video_handles_brightness_key_presses); From 43b5240ca6b33108998810593248186b1e3ae34a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 9 Jun 2022 18:40:32 +0800 Subject: [PATCH 118/248] mm: sysctl: fix missing numa_stat when !CONFIG_HUGETLB_PAGE "numa_stat" should not be included in the scope of CONFIG_HUGETLB_PAGE, if CONFIG_HUGETLB_PAGE is not configured even if CONFIG_NUMA is configured, "numa_stat" is missed form /proc. Move it out of CONFIG_HUGETLB_PAGE to fix it. Fixes: 4518085e127d ("mm, sysctl: make NUMA stats configurable") Signed-off-by: Muchun Song Cc: Acked-by: Michal Hocko Acked-by: Mel Gorman Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e52b6e372c60..aaf0b1f1dc57 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2091,6 +2091,17 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO_HUNDRED, }, +#ifdef CONFIG_NUMA + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif #ifdef CONFIG_HUGETLB_PAGE { .procname = "nr_hugepages", @@ -2107,15 +2118,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif { .procname = "hugetlb_shm_group", From b926f2adb0442090351dc8321ec1f99b22e372da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Jul 2022 15:35:24 -0700 Subject: [PATCH 119/248] Revert "vf/remap: return the amount of bytes actually deduplicated" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 4a57a8400075bc5287c5c877702c68aeae2a033d. Dave Chinner reports: "As I suspected would occur, this change causes test failures. e.g generic/517 in fstests fails with: generic/517 1s ... - output mismatch [..] -deduped 131172/131172 bytes at offset 65536 +deduped 131072/131172 bytes at offset 65536" can you please revert this commit for the 5.19 series to give us more time to investigate and consider the impact of the the API change on userspace applications before we commit to changing the API" That changed return value seems to reflect reality, but with the fstest change, let's revert for now. Requested-by: Dave Chinner Link: https://lore.kernel.org/all/20220714223238.GH3600936@dread.disaster.area/ Cc: Ansgar Lößer Signed-off-by: Linus Torvalds --- fs/remap_range.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/remap_range.c b/fs/remap_range.c index 5e0d97e02f96..881a306ee247 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -547,7 +547,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) else if (deduped < 0) info->status = deduped; else - info->bytes_deduped = deduped; + info->bytes_deduped = len; next_fdput: fdput(dst_fd); From e5d523f1ae8f2cef01f8e071aeee432654166708 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 14 Jul 2022 13:56:43 -0700 Subject: [PATCH 120/248] ubsan: disable UBSAN_DIV_ZERO for clang Building with UBSAN_DIV_ZERO with clang produces numerous fallthrough warnings from objtool. In the case of uncheck division, UBSAN_DIV_ZERO may introduce new control flow to check for division by zero. Because the result of the division is undefined, LLVM may optimize the control flow such that after the call to __ubsan_handle_divrem_overflow doesn't matter. If panic_on_warn was set, __ubsan_handle_divrem_overflow would panic. The problem is is that panic_on_warn is run time configurable. If it's disabled, then we cannot guarantee that we will be able to recover safely. Disable this config for clang until we can come up with a solution in LLVM. Link: https://github.com/ClangBuiltLinux/linux/issues/1657 Link: https://github.com/llvm/llvm-project/issues/56289 Link: https://lore.kernel.org/lkml/CAHk-=wj1qhf7y3VNACEexyp5EbkNpdcu_542k-xZpzmYLOjiCg@mail.gmail.com/ Reported-by: Sudip Mukherjee Suggested-by: Linus Torvalds Signed-off-by: Nick Desaulniers Acked-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- lib/Kconfig.ubsan | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index a9f7eb047768..fd15230a703b 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -84,6 +84,9 @@ config UBSAN_SHIFT config UBSAN_DIV_ZERO bool "Perform checking for integer divide-by-zero" depends on $(cc-option,-fsanitize=integer-divide-by-zero) + # https://github.com/ClangBuiltLinux/linux/issues/1657 + # https://github.com/llvm/llvm-project/issues/56289 + depends on !CC_IS_CLANG help This option enables -fsanitize=integer-divide-by-zero which checks for integer division by zero. This is effectively redundant with the From 2f23256c0ea20627c91ea2d468cda945f68c3395 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Wed, 6 Jul 2022 17:43:29 -0400 Subject: [PATCH 121/248] s390/ap: fix error handling in __verify_queue_reservations() The AP bus's __verify_queue_reservations function increments the ref count for the device driver passed in as a parameter, but fails to decrement it before returning control to the caller. This will prevents any subsequent removal of the module. Signed-off-by: Tony Krowiak Reported-by: Tony Krowiak Reviewed-by: Harald Freudenberger Fixes: 4f8206b88286 ("s390/ap: driver callback to indicate resource in use") Link: https://lore.kernel.org/r/20220706222619.602094-1-akrowiak@linux.ibm.com Cc: stable@vger.kernel.org [agordeev@linux.ibm.com fixed description, added Fixes and Link] Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/ap_bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 5c13d2079d96..0a9045b49c50 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1435,7 +1435,7 @@ static int __verify_queue_reservations(struct device_driver *drv, void *data) if (ap_drv->in_use) { rc = ap_drv->in_use(ap_perms.apm, newaqm); if (rc) - return -EBUSY; + rc = -EBUSY; } /* release the driver's module */ From 9023ca0866250d268b047f21e1392e7a81277a54 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Fri, 15 Jul 2022 08:16:42 +0206 Subject: [PATCH 122/248] printk: do not wait for consoles when suspended The console_stop() and console_start() functions call pr_flush(). When suspending, these functions are called by the serial subsystem while the serial port is suspended. In this scenario, if there are any pending messages, a call to pr_flush() will always result in a timeout because the serial port cannot make forward progress. This causes longer suspend and resume times. Add a check in pr_flush() so that it will immediately timeout if the consoles are suspended. Fixes: 3b604ca81202 ("printk: add pr_flush()") Reported-by: Todd Brandt Signed-off-by: John Ogness Tested-by: Todd Brandt Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220715061042.373640-2-john.ogness@linutronix.de --- kernel/printk/printk.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b49c6ff6dca0..a1a81fd9889b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3380,6 +3380,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre diff = 0; console_lock(); + for_each_console(c) { if (con && con != c) continue; @@ -3389,11 +3390,19 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre if (printk_seq < seq) diff += seq - printk_seq; } - console_unlock(); - if (diff != last_diff && reset_on_progress) + /* + * If consoles are suspended, it cannot be expected that they + * make forward progress, so timeout immediately. @diff is + * still used to return a valid flush status. + */ + if (console_suspended) + remaining = 0; + else if (diff != last_diff && reset_on_progress) remaining = timeout_ms; + console_unlock(); + if (diff == 0 || remaining == 0) break; From 8281b7ec5c56b71cb2cc5a1728b41607be66959c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:51:51 -0700 Subject: [PATCH 123/248] ip: Fix data-races around sysctl_ip_default_ttl. While reading sysctl_ip_default_ttl, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/action.c | 2 +- include/net/route.h | 2 +- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/netfilter/nf_reject_ipv4.c | 4 ++-- net/ipv4/proc.c | 2 +- net/netfilter/nf_synproxy_core.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c index 0147de405365..ffb6f6d05a07 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/action.c +++ b/drivers/net/ethernet/netronome/nfp/flower/action.c @@ -474,7 +474,7 @@ nfp_fl_set_tun(struct nfp_app *app, struct nfp_fl_set_tun *set_tun, set_tun->ttl = ip4_dst_hoplimit(&rt->dst); ip_rt_put(rt); } else { - set_tun->ttl = net->ipv4.sysctl_ip_default_ttl; + set_tun->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); } } diff --git a/include/net/route.h b/include/net/route.h index 991a3985712d..bbcf2aba149f 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -373,7 +373,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) struct net *net = dev_net(dst->dev); if (hoplimit == 0) - hoplimit = net->ipv4.sysctl_ip_default_ttl; + hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); return hoplimit; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 445a9ecaefa1..d497d525dea3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1606,7 +1606,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, { struct net *net = sock_net(sk); val = (inet->uc_ttl == -1 ? - net->ipv4.sysctl_ip_default_ttl : + READ_ONCE(net->ipv4.sysctl_ip_default_ttl) : inet->uc_ttl); break; } diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 918c61fda0f3..d640adcaf1b1 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -62,7 +62,7 @@ struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, - net->ipv4.sysctl_ip_default_ttl); + READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); niph->tot_len = htons(nskb->len); ip_send_check(niph); @@ -117,7 +117,7 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP, - net->ipv4.sysctl_ip_default_ttl); + READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); skb_reset_transport_header(nskb); icmph = skb_put_zero(nskb, sizeof(struct icmphdr)); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 28836071f0a6..0088a4c64d77 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -387,7 +387,7 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, - net->ipv4.sysctl_ip_default_ttl); + READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index e479dd0561c5..16915f8eef2b 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -405,7 +405,7 @@ synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, iph->tos = 0; iph->id = 0; iph->frag_off = htons(IP_DF); - iph->ttl = net->ipv4.sysctl_ip_default_ttl; + iph->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); iph->protocol = IPPROTO_TCP; iph->check = 0; iph->saddr = saddr; From 0968d2a441bf6afb551fd99e60fa65ed67068963 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:51:52 -0700 Subject: [PATCH 124/248] ip: Fix data-races around sysctl_ip_no_pmtu_disc. While reading sysctl_ip_no_pmtu_disc, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/icmp.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/xfrm/xfrm_state.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ac67f6b4ec70..4bc24f9e38b3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -335,7 +335,7 @@ lookup_protocol: inet->hdrincl = 1; } - if (net->ipv4.sysctl_ip_no_pmtu_disc) + if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 57c4f0d87a7a..d5d745c3e345 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -881,7 +881,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) * values please see * Documentation/networking/ip-sysctl.rst */ - switch (net->ipv4.sysctl_ip_no_pmtu_disc) { + switch (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) { default: net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n", &iph->daddr); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 70564ddccc46..6f354f8be2c5 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -226,7 +226,7 @@ lookup_protocol: RCU_INIT_POINTER(inet->mc_list, NULL); inet->rcv_tos = 0; - if (net->ipv4.sysctl_ip_no_pmtu_disc) + if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 08564e0eef20..ccfb172eb5b8 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2620,7 +2620,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) int err; if (family == AF_INET && - xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc) + READ_ONCE(xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)) x->props.flags |= XFRM_STATE_NOPMTUDISC; err = -EPROTONOSUPPORT; From 60c158dc7b1f0558f6cadd5b50d0386da0000d50 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:51:53 -0700 Subject: [PATCH 125/248] ip: Fix data-races around sysctl_ip_fwd_use_pmtu. While reading sysctl_ip_fwd_use_pmtu, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: f87c10a8aa1e ("ipv4: introduce ip_dst_mtu_maybe_forward and protect forwarding path against pmtu spoofing") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- net/ipv4/route.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 26fffda78cca..05fe313f72fa 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -446,7 +446,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, struct net *net = dev_net(dst->dev); unsigned int mtu; - if (net->ipv4.sysctl_ip_fwd_use_pmtu || + if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { mtu = rt->rt_pmtu; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 356f535f3443..91c4f60de75a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1398,7 +1398,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) struct fib_info *fi = res->fi; u32 mtu = 0; - if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu || + if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) || fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) mtu = fi->fib_mtu; From 7bf9e18d9a5e99e3c83482973557e9f047b051e7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:51:54 -0700 Subject: [PATCH 126/248] ip: Fix data-races around sysctl_ip_fwd_update_priority. While reading sysctl_ip_fwd_update_priority, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 432e05d32892 ("net: ipv4: Control SKB reprioritization after forwarding") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 ++- net/ipv4/ip_forward.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 0d8a0068e4ca..868d28f3b4e1 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -10523,13 +10523,14 @@ static int mlxsw_sp_dscp_init(struct mlxsw_sp *mlxsw_sp) static int __mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) { struct net *net = mlxsw_sp_net(mlxsw_sp); - bool usp = net->ipv4.sysctl_ip_fwd_update_priority; char rgcr_pl[MLXSW_REG_RGCR_LEN]; u64 max_rifs; + bool usp; if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_RIFS)) return -EIO; max_rifs = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); + usp = READ_ONCE(net->ipv4.sysctl_ip_fwd_update_priority); mlxsw_reg_rgcr_pack(rgcr_pl, true, true); mlxsw_reg_rgcr_max_router_interfaces_set(rgcr_pl, max_rifs); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index e3aa436a1bdf..e18931a6d153 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -157,7 +157,7 @@ int ip_forward(struct sk_buff *skb) !skb_sec_path(skb)) ip_rt_send_redirect(skb); - if (net->ipv4.sysctl_ip_fwd_update_priority) + if (READ_ONCE(net->ipv4.sysctl_ip_fwd_update_priority)) skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, From 289d3b21fb0bfc94c4e98f10635bba1824e5f83c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:51:55 -0700 Subject: [PATCH 127/248] ip: Fix data-races around sysctl_ip_nonlocal_bind. While reading sysctl_ip_nonlocal_bind, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/inet_sock.h | 2 +- net/sctp/protocol.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index daead5fb389a..68d337775564 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -374,7 +374,7 @@ static inline bool inet_get_convert_csum(struct sock *sk) static inline bool inet_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { - return net->ipv4.sysctl_ip_nonlocal_bind || + return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) || inet->freebind || inet->transparent; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 35928fefae33..1a094b087d88 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -358,7 +358,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && !sp->inet.freebind && - !net->ipv4.sysctl_ip_nonlocal_bind) + !READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind)) return 0; if (ipv6_only_sock(sctp_opt2sk(sp))) From 0db232765887d9807df8bcb7b6f29b2871539eab Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:51:56 -0700 Subject: [PATCH 128/248] ip: Fix a data-race around sysctl_ip_autobind_reuse. While reading sysctl_ip_autobind_reuse, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 4b01a9674231 ("tcp: bind(0) remove the SO_REUSEADDR restriction when ephemeral ports are exhausted.") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 53f5f956d948..2c44556af452 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -263,7 +263,7 @@ next_port: goto other_half_scan; } - if (net->ipv4.sysctl_ip_autobind_reuse && !relax) { + if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) { /* We still have a chance to connect to different destinations */ relax = true; goto ports_exhausted; From 85d0b4dbd74b95cc492b1f4e34497d3f894f5d9a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:51:57 -0700 Subject: [PATCH 129/248] ip: Fix a data-race around sysctl_fwmark_reflect. While reading sysctl_fwmark_reflect, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: e110861f8609 ("net: add a sysctl to reflect the fwmark on replies") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/ip.h b/include/net/ip.h index 05fe313f72fa..4a15b6bcb4b8 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -384,7 +384,7 @@ void ipfrag_init(void); void ip_static_sysctl_init(void); #define IP4_REPLY_MARK(net, mark) \ - ((net)->ipv4.sysctl_fwmark_reflect ? (mark) : 0) + (READ_ONCE((net)->ipv4.sysctl_fwmark_reflect) ? (mark) : 0) static inline bool ip_is_fragment(const struct iphdr *iph) { From 1a0008f9df59451d0a17806c1ee1a19857032fa8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:51:58 -0700 Subject: [PATCH 130/248] tcp/dccp: Fix a data-race around sysctl_tcp_fwmark_accept. While reading sysctl_tcp_fwmark_accept, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 84f39b08d786 ("net: support marking accepting TCP sockets") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/inet_sock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 68d337775564..b29108f0973a 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -107,7 +107,8 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { - if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) + if (!sk->sk_mark && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)) return skb->mark; return sk->sk_mark; From 08a75f10679470552a3a443f9aefd1399604d31d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:51:59 -0700 Subject: [PATCH 131/248] tcp: Fix data-races around sysctl_tcp_l3mdev_accept. While reading sysctl_tcp_l3mdev_accept, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 6dd9a14e92e5 ("net: Allow accepted sockets to be bound to l3mdev domain") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 2 +- include/net/inet_sock.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ebfa3df6f8dc..fd6b510d114b 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -179,7 +179,7 @@ static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - return inet_bound_dev_eq(!!net->ipv4.sysctl_tcp_l3mdev_accept, + return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept), bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index b29108f0973a..6395f6b9a5d2 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -121,7 +121,7 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); - if (!bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept) + if (!bound_dev_if && READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept)) return l3mdev_master_ifindex_by_index(net, skb->skb_iif); #endif @@ -133,7 +133,7 @@ static inline int inet_sk_bound_l3mdev(const struct sock *sk) #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); - if (!net->ipv4.sysctl_tcp_l3mdev_accept) + if (!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept)) return l3mdev_master_ifindex_by_index(net, sk->sk_bound_dev_if); #endif From f47d00e077e7d61baf69e46dde3210c886360207 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:52:00 -0700 Subject: [PATCH 132/248] tcp: Fix data-races around sysctl_tcp_mtu_probing. While reading sysctl_tcp_mtu_probing, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 5d424d5a674f ("[TCP]: MTU probing") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 11aa0ab10bba..3fcfc0f1e9f9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1762,7 +1762,7 @@ void tcp_mtup_init(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); - icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1; + icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 20cf4a98c69d..98bb00e29e1e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -163,7 +163,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) int mss; /* Black hole detection */ - if (!net->ipv4.sysctl_tcp_mtu_probing) + if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing)) return; if (!icsk->icsk_mtup.enabled) { From 88d78bc097cd8ebc6541e93316c9d9bf651b13e8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:52:01 -0700 Subject: [PATCH 133/248] tcp: Fix data-races around sysctl_tcp_base_mss. While reading sysctl_tcp_base_mss, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 5d424d5a674f ("[TCP]: MTU probing") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3fcfc0f1e9f9..9450d8469871 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1765,7 +1765,7 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; - icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss)); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 98bb00e29e1e..04063c7e33ba 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -171,7 +171,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } else { mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; - mss = min(net->ipv4.sysctl_tcp_base_mss, mss); + mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss); mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor); mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); From 78eb166cdefcc3221c8c7c1e2d514e91a2eb5014 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:52:02 -0700 Subject: [PATCH 134/248] tcp: Fix data-races around sysctl_tcp_min_snd_mss. While reading sysctl_tcp_min_snd_mss, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 5f3e2bf008c2 ("tcp: add tcp_min_snd_mss sysctl") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 ++- net/ipv4/tcp_timer.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9450d8469871..7130b405da21 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1719,7 +1719,8 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ - mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss); + mss_now = max(mss_now, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss)); return mss_now; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 04063c7e33ba..39107bb730b0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -173,7 +173,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss); mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor); - mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss); + mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss)); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); } tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); From 8e92d4423615a5257d0d871fc067aa561f597deb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:52:03 -0700 Subject: [PATCH 135/248] tcp: Fix a data-race around sysctl_tcp_mtu_probe_floor. While reading sysctl_tcp_mtu_probe_floor, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: c04b79b6cfd7 ("tcp: add new tcp_mtu_probe_floor sysctl") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 39107bb730b0..4f3b9ab222b6 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -172,7 +172,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) } else { mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss); - mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor); + mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor)); mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss)); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); } From 92c0aa4175474483d6cf373314343d4e624e882a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:52:04 -0700 Subject: [PATCH 136/248] tcp: Fix a data-race around sysctl_tcp_probe_threshold. While reading sysctl_tcp_probe_threshold, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 6b58e0a5f32d ("ipv4: Use binary search to choose tcp PMTU probe_size") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7130b405da21..aa757c74dad4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2367,7 +2367,7 @@ static int tcp_mtu_probe(struct sock *sk) * probing process by not resetting search range to its orignal. */ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || - interval < net->ipv4.sysctl_tcp_probe_threshold) { + interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) { /* Check whether enough time has elaplased for * another round of probing. */ From 2a85388f1d94a9f8b5a529118a2c5eaa0520d85c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 13:52:05 -0700 Subject: [PATCH 137/248] tcp: Fix a data-race around sysctl_tcp_probe_interval. While reading sysctl_tcp_probe_interval, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 05cbc0db03e8 ("ipv4: Create probe timer for tcp PMTU as per RFC4821") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index aa757c74dad4..02ab3a9c6657 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2283,7 +2283,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) u32 interval; s32 delta; - interval = net->ipv4.sysctl_tcp_probe_interval; + interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval); delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; if (unlikely(delta >= interval * HZ)) { int mss = tcp_current_mss(sk); From fa4b3ca60e8011d3046765b3de8d3f1ffc53af28 Mon Sep 17 00:00:00 2001 From: Biao Huang Date: Thu, 14 Jul 2022 14:00:12 +0800 Subject: [PATCH 138/248] stmmac: dwmac-mediatek: fix clock issue The pm_runtime takes care of the clock handling in current stmmac drivers, and dwmac-mediatek implement the mediatek_dwmac_clks_config() as the callback for pm_runtime. Then, stripping duplicated clocks handling in old init()/exit() to fix clock issue in suspend/resume test. As to clocks in probe/remove, vendor need symmetric handling to ensure clocks balance. Test pass, including suspend/resume and ko insertion/remove. Fixes: 3186bdad97d5 ("stmmac: dwmac-mediatek: add platform level clocks management") Signed-off-by: Biao Huang Reviewed-by: Matthias Brugger Signed-off-by: David S. Miller --- .../ethernet/stmicro/stmmac/dwmac-mediatek.c | 49 ++++++++----------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index 6ff88df58767..ca8ab290013c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -576,32 +576,7 @@ static int mediatek_dwmac_init(struct platform_device *pdev, void *priv) } } - ret = clk_bulk_prepare_enable(variant->num_clks, plat->clks); - if (ret) { - dev_err(plat->dev, "failed to enable clks, err = %d\n", ret); - return ret; - } - - ret = clk_prepare_enable(plat->rmii_internal_clk); - if (ret) { - dev_err(plat->dev, "failed to enable rmii internal clk, err = %d\n", ret); - goto err_clk; - } - return 0; - -err_clk: - clk_bulk_disable_unprepare(variant->num_clks, plat->clks); - return ret; -} - -static void mediatek_dwmac_exit(struct platform_device *pdev, void *priv) -{ - struct mediatek_dwmac_plat_data *plat = priv; - const struct mediatek_dwmac_variant *variant = plat->variant; - - clk_disable_unprepare(plat->rmii_internal_clk); - clk_bulk_disable_unprepare(variant->num_clks, plat->clks); } static int mediatek_dwmac_clks_config(void *priv, bool enabled) @@ -643,7 +618,6 @@ static int mediatek_dwmac_common_data(struct platform_device *pdev, plat->addr64 = priv_plat->variant->dma_bit_mask; plat->bsp_priv = priv_plat; plat->init = mediatek_dwmac_init; - plat->exit = mediatek_dwmac_exit; plat->clks_config = mediatek_dwmac_clks_config; if (priv_plat->variant->dwmac_fix_mac_speed) plat->fix_mac_speed = priv_plat->variant->dwmac_fix_mac_speed; @@ -712,13 +686,32 @@ static int mediatek_dwmac_probe(struct platform_device *pdev) mediatek_dwmac_common_data(pdev, plat_dat, priv_plat); mediatek_dwmac_init(pdev, priv_plat); + ret = mediatek_dwmac_clks_config(priv_plat, true); + if (ret) + return ret; + ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); if (ret) { stmmac_remove_config_dt(pdev, plat_dat); - return ret; + goto err_drv_probe; } return 0; + +err_drv_probe: + mediatek_dwmac_clks_config(priv_plat, false); + return ret; +} + +static int mediatek_dwmac_remove(struct platform_device *pdev) +{ + struct mediatek_dwmac_plat_data *priv_plat = get_stmmac_bsp_priv(&pdev->dev); + int ret; + + ret = stmmac_pltfr_remove(pdev); + mediatek_dwmac_clks_config(priv_plat, false); + + return ret; } static const struct of_device_id mediatek_dwmac_match[] = { @@ -733,7 +726,7 @@ MODULE_DEVICE_TABLE(of, mediatek_dwmac_match); static struct platform_driver mediatek_dwmac_driver = { .probe = mediatek_dwmac_probe, - .remove = stmmac_pltfr_remove, + .remove = mediatek_dwmac_remove, .driver = { .name = "dwmac-mediatek", .pm = &stmmac_pltfr_pm_ops, From 0d9a15913b871e03fdd3b3d90a2e665fb22f9bcf Mon Sep 17 00:00:00 2001 From: Biao Huang Date: Thu, 14 Jul 2022 14:00:13 +0800 Subject: [PATCH 139/248] net: stmmac: fix pm runtime issue in stmmac_dvr_remove() If netif is running when stmmac_dvr_remove is invoked, the unregister_netdev will call ndo_stop(stmmac_release) and vlan_kill_rx_filter(stmmac_vlan_rx_kill_vid). Currently, stmmac_dvr_remove() will disable pm runtime before unregister_netdev. When stmmac_vlan_rx_kill_vid is invoked, pm_runtime_resume_and_get in it returns EACCESS error number, and reports: dwmac-mediatek 11021000.ethernet eth0: stmmac_dvr_remove: removing driver dwmac-mediatek 11021000.ethernet eth0: FPE workqueue stop dwmac-mediatek 11021000.ethernet eth0: failed to kill vid 0081/0 Move the pm_runtime_disable to the end of stmmac_dvr_remove to fix this issue. Fixes: 6449520391dfc ("net: stmmac: properly handle with runtime pm in stmmac_dvr_remove()") Signed-off-by: Biao Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index d1a7cf4567bc..197fac587ad5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7213,8 +7213,6 @@ int stmmac_dvr_remove(struct device *dev) netdev_info(priv->dev, "%s: removing driver", __func__); pm_runtime_get_sync(dev); - pm_runtime_disable(dev); - pm_runtime_put_noidle(dev); stmmac_stop_all_dma(priv); stmmac_mac_set(priv, priv->ioaddr, false); @@ -7241,6 +7239,9 @@ int stmmac_dvr_remove(struct device *dev) mutex_destroy(&priv->lock); bitmap_free(priv->af_xdp_zc_qps); + pm_runtime_disable(dev); + pm_runtime_put_noidle(dev); + return 0; } EXPORT_SYMBOL_GPL(stmmac_dvr_remove); From f4c7d8948e866918d61493264dbbd67e45ef2bda Mon Sep 17 00:00:00 2001 From: Biao Huang Date: Thu, 14 Jul 2022 14:00:14 +0800 Subject: [PATCH 140/248] net: stmmac: fix unbalanced ptp clock issue in suspend/resume flow Current stmmac driver will prepare/enable ptp_ref clock in stmmac_init_tstamp_counter(). The stmmac_pltfr_noirq_suspend will disable it once in suspend flow. But in resume flow, stmmac_pltfr_noirq_resume --> stmmac_init_tstamp_counter stmmac_resume --> stmmac_hw_setup --> stmmac_init_ptp --> stmmac_init_tstamp_counter ptp_ref clock reference counter increases twice, which leads to unbalance ptp clock when resume back. Move ptp_ref clock prepare/enable out of stmmac_init_tstamp_counter to fix it. Fixes: 0735e639f129d ("net: stmmac: skip only stmmac_ptp_register when resume from suspend") Signed-off-by: Biao Huang Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 17 ++++++++--------- .../ethernet/stmicro/stmmac/stmmac_platform.c | 8 +++++++- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 197fac587ad5..c5f33630e771 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -834,19 +834,10 @@ int stmmac_init_tstamp_counter(struct stmmac_priv *priv, u32 systime_flags) struct timespec64 now; u32 sec_inc = 0; u64 temp = 0; - int ret; if (!(priv->dma_cap.time_stamp || priv->dma_cap.atime_stamp)) return -EOPNOTSUPP; - ret = clk_prepare_enable(priv->plat->clk_ptp_ref); - if (ret < 0) { - netdev_warn(priv->dev, - "failed to enable PTP reference clock: %pe\n", - ERR_PTR(ret)); - return ret; - } - stmmac_config_hw_tstamping(priv, priv->ptpaddr, systime_flags); priv->systime_flags = systime_flags; @@ -3270,6 +3261,14 @@ static int stmmac_hw_setup(struct net_device *dev, bool ptp_register) stmmac_mmc_setup(priv); + if (ptp_register) { + ret = clk_prepare_enable(priv->plat->clk_ptp_ref); + if (ret < 0) + netdev_warn(priv->dev, + "failed to enable PTP reference clock: %pe\n", + ERR_PTR(ret)); + } + ret = stmmac_init_ptp(priv); if (ret == -EOPNOTSUPP) netdev_info(priv->dev, "PTP not supported by HW\n"); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 11e1055e8260..9f5cac4000da 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -815,7 +815,13 @@ static int __maybe_unused stmmac_pltfr_noirq_resume(struct device *dev) if (ret) return ret; - stmmac_init_tstamp_counter(priv, priv->systime_flags); + ret = clk_prepare_enable(priv->plat->clk_ptp_ref); + if (ret < 0) { + netdev_warn(priv->dev, + "failed to enable PTP reference clock: %pe\n", + ERR_PTR(ret)); + return ret; + } } return 0; From 79629181607e801c0b41b8790ac4ee2eb5d7bc3e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 15 Jul 2022 07:34:55 -0400 Subject: [PATCH 141/248] KVM: emulate: do not adjust size of fastop and setcc subroutines Instead of doing complicated calculations to find the size of the subroutines (which are even more complicated because they need to be stringified into an asm statement), just hardcode to 16. It is less dense for a few combinations of IBT/SLS/retbleed, but it has the advantage of being really simple. Cc: stable@vger.kernel.org # 5.15.x: 84e7051c0bc1: x86/kvm: fix FASTOP_SIZE when return thunks are enabled Cc: stable@vger.kernel.org Suggested-by: Linus Torvalds Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0a15b0fec6d9..f8382abe22ff 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -189,13 +189,6 @@ #define X8(x...) X4(x), X4(x) #define X16(x...) X8(x), X8(x) -#define NR_FASTOP (ilog2(sizeof(ulong)) + 1) -#define RET_LENGTH (1 + (4 * IS_ENABLED(CONFIG_RETHUNK)) + \ - IS_ENABLED(CONFIG_SLS)) -#define FASTOP_LENGTH (ENDBR_INSN_SIZE + 7 + RET_LENGTH) -#define FASTOP_SIZE (8 << ((FASTOP_LENGTH > 8) & 1) << ((FASTOP_LENGTH > 16) & 1)) -static_assert(FASTOP_LENGTH <= FASTOP_SIZE); - struct opcode { u64 flags; u8 intercept; @@ -310,9 +303,15 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for * different operand sizes can be reached by calculation, rather than a jump * table (which would be bigger than the code). + * + * The 16 byte alignment, considering 5 bytes for the RET thunk, 3 for ENDBR + * and 1 for the straight line speculation INT3, leaves 7 bytes for the + * body of the function. Currently none is larger than 4. */ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); +#define FASTOP_SIZE 16 + #define __FOP_FUNC(name) \ ".align " __stringify(FASTOP_SIZE) " \n\t" \ ".type " name ", @function \n\t" \ @@ -446,9 +445,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); * RET | JMP __x86_return_thunk [1,5 bytes; CONFIG_RETHUNK] * INT3 [1 byte; CONFIG_SLS] */ -#define SETCC_LENGTH (ENDBR_INSN_SIZE + 3 + RET_LENGTH) -#define SETCC_ALIGN (4 << ((SETCC_LENGTH > 4) & 1) << ((SETCC_LENGTH > 8) & 1)) -static_assert(SETCC_LENGTH <= SETCC_ALIGN); +#define SETCC_ALIGN 16 #define FOP_SETCC(op) \ ".align " __stringify(SETCC_ALIGN) " \n\t" \ From fc7cbcd4890e297de5d6487e04344a99b39de9be Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 15 Jul 2022 13:59:21 +0200 Subject: [PATCH 142/248] Revert "btrfs: turn fs_roots_radix in btrfs_fs_info into an XArray" This reverts commit 48b36a602a335c184505346b5b37077840660634. Revert the xarray conversion, there's a problem with potential sleep-inside-spinlock [1] when calling xa_insert that triggers GFP_NOFS allocation. The radix tree used the preloading mechanism to avoid sleeping but this is not available in xarray. Conversion from spin lock to mutex is possible but at time of rc6 is riskier than a clean revert. [1] https://lore.kernel.org/linux-btrfs/cover.1657097693.git.fdmanana@suse.com/ Reported-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 8 +- fs/btrfs/disk-io.c | 179 +++++++++++++++++++---------------- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/inode.c | 13 ++- fs/btrfs/tests/btrfs-tests.c | 2 +- fs/btrfs/transaction.c | 104 +++++++++++--------- 6 files changed, 170 insertions(+), 138 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 415bf1823fb3..e0d3cf2ec0dd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -675,9 +675,8 @@ struct btrfs_fs_info { rwlock_t global_root_lock; struct rb_root global_root_tree; - /* The xarray that holds all the FS roots */ - spinlock_t fs_roots_lock; - struct xarray fs_roots; + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; /* block group cache stuff */ rwlock_t block_group_cache_lock; @@ -1119,8 +1118,7 @@ enum { */ BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, - /* The root is tracked in fs_info::fs_roots */ - BTRFS_ROOT_REGISTERED, + BTRFS_ROOT_IN_RADIX, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, BTRFS_ROOT_DEFRAG_RUNNING, BTRFS_ROOT_FORCE_COW, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d92cc7893610..88ba485b155b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -1210,9 +1211,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&root->leak_list); - spin_lock(&fs_info->fs_roots_lock); + spin_lock(&fs_info->fs_roots_radix_lock); list_add_tail(&root->leak_list, &fs_info->allocated_roots); - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); #endif } @@ -1659,11 +1660,12 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, { struct btrfs_root *root; - spin_lock(&fs_info->fs_roots_lock); - root = xa_load(&fs_info->fs_roots, (unsigned long)root_id); + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_id); if (root) root = btrfs_grab_root(root); - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1705,14 +1707,20 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, { int ret; - spin_lock(&fs_info->fs_roots_lock); - ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid, - root, GFP_NOFS); + ret = radix_tree_preload(GFP_NOFS); + if (ret) + return ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); if (ret == 0) { btrfs_grab_root(root); - set_bit(BTRFS_ROOT_REGISTERED, &root->state); + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); return ret; } @@ -2342,9 +2350,9 @@ void btrfs_put_root(struct btrfs_root *root) btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG - spin_lock(&root->fs_info->fs_roots_lock); + spin_lock(&root->fs_info->fs_roots_radix_lock); list_del_init(&root->leak_list); - spin_unlock(&root->fs_info->fs_roots_lock); + spin_unlock(&root->fs_info->fs_roots_radix_lock); #endif kfree(root); } @@ -2352,21 +2360,28 @@ void btrfs_put_root(struct btrfs_root *root) void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root; - unsigned long index = 0; + int ret; + struct btrfs_root *gang[8]; + int i; while (!list_empty(&fs_info->dead_roots)) { - root = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&root->root_list); + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); - if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) - btrfs_drop_and_free_fs_root(fs_info, root); - btrfs_put_root(root); + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) + btrfs_drop_and_free_fs_root(fs_info, gang[0]); + btrfs_put_root(gang[0]); } - xa_for_each(&fs_info->fs_roots, index, root) { - btrfs_drop_and_free_fs_root(fs_info, root); + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_drop_and_free_fs_root(fs_info, gang[i]); } } @@ -3134,7 +3149,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); @@ -3143,7 +3158,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->fs_roots_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); @@ -3374,7 +3389,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) /* * btrfs_find_orphan_roots() is responsible for finding all the dead * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load - * them into the fs_info->fs_roots. This must be done before + * them into the fs_info->fs_roots_radix tree. This must be done before * calling btrfs_orphan_cleanup() on the tree root. If we don't do it * first, then btrfs_orphan_cleanup() will delete a dead root's orphan * item before the root's tree is deleted - this means that if we unmount @@ -4498,11 +4513,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, { bool drop_ref = false; - spin_lock(&fs_info->fs_roots_lock); - xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid); - if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state)) + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) drop_ref = true; - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); @@ -4518,48 +4534,50 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { - struct btrfs_root *roots[8]; - unsigned long index = 0; - int i; + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i = 0; int err = 0; - int grabbed; + unsigned int ret = 0; while (1) { - struct btrfs_root *root; - - spin_lock(&fs_info->fs_roots_lock); - if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) { - spin_unlock(&fs_info->fs_roots_lock); - return err; + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) { + spin_unlock(&fs_info->fs_roots_radix_lock); + break; } + root_objectid = gang[ret - 1]->root_key.objectid + 1; - grabbed = 0; - xa_for_each_start(&fs_info->fs_roots, index, root, index) { - /* Avoid grabbing roots in dead_roots */ - if (btrfs_root_refs(&root->root_item) > 0) - roots[grabbed++] = btrfs_grab_root(root); - if (grabbed >= ARRAY_SIZE(roots)) - break; - } - spin_unlock(&fs_info->fs_roots_lock); - - for (i = 0; i < grabbed; i++) { - if (!roots[i]) + for (i = 0; i < ret; i++) { + /* Avoid to grab roots in dead_roots */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; continue; - index = roots[i]->root_key.objectid; - err = btrfs_orphan_cleanup(roots[i]); - if (err) - goto out; - btrfs_put_root(roots[i]); + } + /* grab all the search result for later use */ + gang[i] = btrfs_grab_root(gang[i]); } - index++; + spin_unlock(&fs_info->fs_roots_radix_lock); + + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); + if (err) + break; + btrfs_put_root(gang[i]); + } + root_objectid++; } -out: - /* Release the roots that remain uncleaned due to error */ - for (; i < grabbed; i++) { - if (roots[i]) - btrfs_put_root(roots[i]); + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_root(gang[i]); } return err; } @@ -4878,28 +4896,31 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) { - unsigned long index = 0; - int grabbed = 0; - struct btrfs_root *roots[8]; + struct btrfs_root *gang[8]; + u64 root_objectid = 0; + int ret; - spin_lock(&fs_info->fs_roots_lock); - while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index, - ULONG_MAX, 8, XA_PRESENT))) { - for (int i = 0; i < grabbed; i++) - roots[i] = btrfs_grab_root(roots[i]); - spin_unlock(&fs_info->fs_roots_lock); + spin_lock(&fs_info->fs_roots_radix_lock); + while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang))) != 0) { + int i; - for (int i = 0; i < grabbed; i++) { - if (!roots[i]) + for (i = 0; i < ret; i++) + gang[i] = btrfs_grab_root(gang[i]); + spin_unlock(&fs_info->fs_roots_radix_lock); + + for (i = 0; i < ret; i++) { + if (!gang[i]) continue; - index = roots[i]->root_key.objectid; - btrfs_free_log(NULL, roots[i]); - btrfs_put_root(roots[i]); + root_objectid = gang[i]->root_key.objectid; + btrfs_free_log(NULL, gang[i]); + btrfs_put_root(gang[i]); } - index++; - spin_lock(&fs_info->fs_roots_lock); + root_objectid++; + spin_lock(&fs_info->fs_roots_radix_lock); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log_root_tree(NULL, fs_info); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4515497d8a29..e14f61ecc189 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5829,7 +5829,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_qgroup_convert_reserved_meta(root, INT_MAX); btrfs_qgroup_free_meta_all_pertrans(root); - if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); else btrfs_put_root(root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 00c9ad053a7a..306673ef7256 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3578,7 +3578,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) u64 last_objectid = 0; int ret = 0, nr_unlink = 0; - /* Bail out if the cleanup is already running. */ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; @@ -3661,17 +3660,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into - * fs_info->fs_roots. So here we can find if an + * fs_info->fs_roots_radix. So here we can find if an * orphan item corresponds to a deleted root by looking - * up the root from that xarray. + * up the root from that radix tree. */ - spin_lock(&fs_info->fs_roots_lock); - dead_root = xa_load(&fs_info->fs_roots, - (unsigned long)found_key.objectid); + spin_lock(&fs_info->fs_roots_radix_lock); + dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)found_key.objectid); if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) is_dead_root = 1; - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); if (is_dead_root) { /* prevent this orphan from being found again */ diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 1591bfa55bcc..c8c4efc9a3fb 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -186,7 +186,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) if (!root) return; /* Will be freed by btrfs_free_fs_roots */ - if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state))) + if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; btrfs_global_root_delete(root); btrfs_put_root(root); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 06c0a958d114..875b801ab3d7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,7 +23,7 @@ #include "space-info.h" #include "zoned.h" -#define BTRFS_ROOT_TRANS_TAG XA_MARK_0 +#define BTRFS_ROOT_TRANS_TAG 0 /* * Transaction states and transitions @@ -437,15 +437,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, */ smp_wmb(); - spin_lock(&fs_info->fs_roots_lock); + spin_lock(&fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid && !force) { - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } - xa_set_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_lock); + radix_tree_tag_set(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); root->last_trans = trans->transid; /* this is pretty tricky. We don't want to @@ -487,9 +487,11 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, spin_unlock(&cur_trans->dropped_roots_lock); /* Make sure we don't try to update the root at commit time */ - xa_clear_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); } int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, @@ -1402,8 +1404,9 @@ void btrfs_add_dead_root(struct btrfs_root *root) static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root; - unsigned long index; + struct btrfs_root *gang[8]; + int i; + int ret; /* * At this point no one can be using this transaction to modify any tree @@ -1411,46 +1414,57 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) */ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); - spin_lock(&fs_info->fs_roots_lock); - xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) { - int ret; + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + struct btrfs_root *root = gang[i]; + int ret2; - /* - * At this point we can neither have tasks logging inodes - * from a root nor trying to commit a log tree. - */ - ASSERT(atomic_read(&root->log_writers) == 0); - ASSERT(atomic_read(&root->log_commit[0]) == 0); - ASSERT(atomic_read(&root->log_commit[1]) == 0); + /* + * At this point we can neither have tasks logging inodes + * from a root nor trying to commit a log tree. + */ + ASSERT(atomic_read(&root->log_writers) == 0); + ASSERT(atomic_read(&root->log_commit[0]) == 0); + ASSERT(atomic_read(&root->log_commit[1]) == 0); - xa_clear_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_lock); + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); - btrfs_free_log(trans, root); - ret = btrfs_update_reloc_root(trans, root); - if (ret) - return ret; + btrfs_free_log(trans, root); + ret2 = btrfs_update_reloc_root(trans, root); + if (ret2) + return ret2; - /* See comments in should_cow_block() */ - clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); - smp_mb__after_atomic(); + /* see comments in should_cow_block() */ + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); - if (root->commit_root != root->node) { - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); - btrfs_set_root_node(&root->root_item, root->node); + if (root->commit_root != root->node) { + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + btrfs_set_root_node(&root->root_item, + root->node); + } + + ret2 = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, + &root->root_item); + if (ret2) + return ret2; + spin_lock(&fs_info->fs_roots_radix_lock); + btrfs_qgroup_free_meta_all_pertrans(root); } - - ret = btrfs_update_root(trans, fs_info->tree_root, - &root->root_key, &root->root_item); - if (ret) - return ret; - spin_lock(&fs_info->fs_roots_lock); - btrfs_qgroup_free_meta_all_pertrans(root); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } From 01cd390903e00c8f42ba0e84f25a70e3d613a15c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 15 Jul 2022 13:59:31 +0200 Subject: [PATCH 143/248] Revert "btrfs: turn fs_info member buffer_radix into XArray" This reverts commit 8ee922689d67b7cfa6acbe2aa1ee76ac72e6fc8a. Revert the xarray conversion, there's a problem with potential sleep-inside-spinlock [1] when calling xa_insert that triggers GFP_NOFS allocation. The radix tree used the preloading mechanism to avoid sleeping but this is not available in xarray. Conversion from spin lock to mutex is possible but at time of rc6 is riskier than a clean revert. [1] https://lore.kernel.org/linux-btrfs/cover.1657097693.git.fdmanana@suse.com/ Reported-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 +- fs/btrfs/disk-io.c | 4 +- fs/btrfs/extent_io.c | 120 +++++++++++++++++++++-------------- fs/btrfs/tests/btrfs-tests.c | 22 ++++++- 4 files changed, 96 insertions(+), 54 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e0d3cf2ec0dd..a240e8b83709 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -994,10 +994,10 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer xarray */ + /* Extent buffer radix tree */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct xarray extent_buffers; + struct radix_tree_root buffer_radix; /* next backup root to be overwritten */ int backup_root_index; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 88ba485b155b..f142ab43df36 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -486,7 +486,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, fs_info->nodesize); - /* A dirty eb shouldn't disappear from extent_buffers */ + /* A dirty eb shouldn't disappear from buffer_radix */ if (WARN_ON(!eb)) return -EUCLEAN; @@ -3150,7 +3150,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9c250b8cd548..31729b1be5f3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2966,7 +2966,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) } /* - * Find extent buffer for a given bytenr. + * Find extent buffer for a givne bytenr. * * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking * in endio context. @@ -2985,9 +2985,11 @@ static struct extent_buffer *find_extent_buffer_readpage( return (struct extent_buffer *)page->private; } - /* For subpage case, we need to lookup extent buffer xarray */ - eb = xa_load(&fs_info->extent_buffers, - bytenr >> fs_info->sectorsize_bits); + /* For subpage case, we need to lookup buffer radix tree */ + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + bytenr >> fs_info->sectorsize_bits); + rcu_read_unlock(); ASSERT(eb); return eb; } @@ -4434,8 +4436,8 @@ static struct extent_buffer *find_extent_buffer_nolock( struct extent_buffer *eb; rcu_read_lock(); - eb = xa_load(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); return eb; @@ -6128,22 +6130,24 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; - - do { - ret = xa_insert(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits, - eb, GFP_NOFS); - if (ret == -ENOMEM) { - exists = ERR_PTR(ret); +again: + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) goto free_eb; - } - if (ret == -EBUSY) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - } - } while (ret); - + else + goto again; + } check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6318,22 +6322,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); +again: + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } - do { - ret = xa_insert(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits, - eb, GFP_NOFS); - if (ret == -ENOMEM) { - exists = ERR_PTR(ret); + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) goto free_eb; - } - if (ret == -EBUSY) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - } - } while (ret); - + else + goto again; + } /* add one reference for the tree */ check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6378,8 +6385,10 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); - xa_erase(&fs_info->extent_buffers, - eb->start >> fs_info->sectorsize_bits); + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, + eb->start >> fs_info->sectorsize_bits); + spin_unlock(&fs_info->buffer_lock); } else { spin_unlock(&eb->refs_lock); } @@ -7324,25 +7333,42 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *eb; - unsigned long index; + struct extent_buffer *gang[GANG_LOOKUP_SIZE]; + struct extent_buffer *found = NULL; u64 page_start = page_offset(page); + u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - xa_for_each_start(&fs_info->extent_buffers, index, eb, - page_start >> fs_info->sectorsize_bits) { - if (in_range(eb->start, page_start, PAGE_SIZE)) - return eb; - else if (eb->start >= page_start + PAGE_SIZE) - /* Already beyond page end */ - return NULL; + while (cur < page_start + PAGE_SIZE) { + int ret; + int i; + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, + (void **)gang, cur >> fs_info->sectorsize_bits, + min_t(unsigned int, GANG_LOOKUP_SIZE, + PAGE_SIZE / fs_info->nodesize)); + if (ret == 0) + goto out; + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + goto out; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + goto out; + } + } + cur = gang[ret - 1]->start + gang[ret - 1]->len; } - return NULL; +out: + return found; } static int try_release_subpage_extent_buffer(struct page *page) diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index c8c4efc9a3fb..d8e56edd6991 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -150,8 +150,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - unsigned long index; - struct extent_buffer *eb; + struct radix_tree_iter iter; + void **slot; struct btrfs_device *dev, *tmp; if (!fs_info) @@ -163,9 +163,25 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - xa_for_each(&fs_info->extent_buffers, index, eb) { + spin_lock(&fs_info->buffer_lock); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + slot = radix_tree_iter_retry(&iter); + continue; + } + slot = radix_tree_iter_resume(slot, &iter); + spin_unlock(&fs_info->buffer_lock); free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); } + spin_unlock(&fs_info->buffer_lock); btrfs_mapping_tree_free(&fs_info->mapping_tree); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, From 5b8418b84303d9a0a0f7f28d6eaed915247ebdc3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 15 Jul 2022 13:59:38 +0200 Subject: [PATCH 144/248] Revert "btrfs: turn name_cache radix tree into XArray in send_ctx" This reverts commit 4076942021fe14efecae33bf98566df6dd5ae6f7. Revert the xarray conversion, there's a problem with potential sleep-inside-spinlock [1] when calling xa_insert that triggers GFP_NOFS allocation. The radix tree used the preloading mechanism to avoid sleeping but this is not available in xarray. Conversion from spin lock to mutex is possible but at time of rc6 is riskier than a clean revert. [1] https://lore.kernel.org/linux-btrfs/cover.1657097693.git.fdmanana@suse.com/ Reported-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5a05beabf0c3..d6a2428fe22f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -127,7 +128,7 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct xarray name_cache; + struct radix_tree_root name_cache; struct list_head name_cache_list; int name_cache_size; @@ -268,13 +269,14 @@ struct orphan_dir_info { struct name_cache_entry { struct list_head list; /* - * On 32bit kernels, xarray has only 32bit indices, but we need to - * handle 64bit inums. We use the lower 32bit of the 64bit inum to store - * it in the tree. If more than one inum would fall into the same entry, - * we use inum_aliases to store the additional entries. inum_aliases is - * also used to store entries with the same inum but different generations. + * radix_tree has only 32bit entries but we need to handle 64bit inums. + * We use the lower 32bit of the 64bit inum to store it in the tree. If + * more then one inum would fall into the same entry, we use radix_list + * to store the additional entries. radix_list is also used to store + * entries where two entries have the same inum but different + * generations. */ - struct list_head inum_aliases; + struct list_head radix_list; u64 ino; u64 gen; u64 parent_ino; @@ -2024,9 +2026,9 @@ out: } /* - * Insert a name cache entry. On 32bit kernels the xarray index is 32bit, + * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::inum_aliases. + * takes care of this with the help of name_cache_entry::radix_list. * In case of error, nce is kfreed. */ static int name_cache_insert(struct send_ctx *sctx, @@ -2035,7 +2037,8 @@ static int name_cache_insert(struct send_ctx *sctx, int ret = 0; struct list_head *nce_head; - nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); if (!nce_head) { @@ -2044,14 +2047,14 @@ static int name_cache_insert(struct send_ctx *sctx, } INIT_LIST_HEAD(nce_head); - ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL); + ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); if (ret < 0) { kfree(nce_head); kfree(nce); return ret; } } - list_add_tail(&nce->inum_aliases, nce_head); + list_add_tail(&nce->radix_list, nce_head); list_add_tail(&nce->list, &sctx->name_cache_list); sctx->name_cache_size++; @@ -2063,14 +2066,15 @@ static void name_cache_delete(struct send_ctx *sctx, { struct list_head *nce_head; - nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); if (!nce_head) { btrfs_err(sctx->send_root->fs_info, "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", nce->ino, sctx->name_cache_size); } - list_del(&nce->inum_aliases); + list_del(&nce->radix_list); list_del(&nce->list); sctx->name_cache_size--; @@ -2078,7 +2082,7 @@ static void name_cache_delete(struct send_ctx *sctx, * We may not get to the final release of nce_head if the lookup fails */ if (nce_head && list_empty(nce_head)) { - xa_erase(&sctx->name_cache, (unsigned long)nce->ino); + radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } } @@ -2089,11 +2093,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, struct list_head *nce_head; struct name_cache_entry *cur; - nce_head = xa_load(&sctx->name_cache, (unsigned long)ino); + nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); if (!nce_head) return NULL; - list_for_each_entry(cur, nce_head, inum_aliases) { + list_for_each_entry(cur, nce_head, radix_list) { if (cur->ino == ino && cur->gen == gen) return cur; } @@ -7518,7 +7522,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - xa_init_flags(&sctx->name_cache, GFP_KERNEL); + INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); sctx->flags = arg->flags; From 088aea3b97e0ae5a2a86f5d142ad10fec8a1b80f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 15 Jul 2022 13:59:45 +0200 Subject: [PATCH 145/248] Revert "btrfs: turn delayed_nodes_tree into an XArray" This reverts commit 253bf57555e451dec5a7f09dc95d380ce8b10e5b. Revert the xarray conversion, there's a problem with potential sleep-inside-spinlock [1] when calling xa_insert that triggers GFP_NOFS allocation. The radix tree used the preloading mechanism to avoid sleeping but this is not available in xarray. Conversion from spin lock to mutex is possible but at time of rc6 is riskier than a clean revert. [1] https://lore.kernel.org/linux-btrfs/cover.1657097693.git.fdmanana@suse.com/ Reported-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 +-- fs/btrfs/delayed-inode.c | 84 +++++++++++++++++++++------------------- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 2 +- 4 files changed, 50 insertions(+), 44 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a240e8b83709..9c21e214d29e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1222,10 +1222,10 @@ struct btrfs_root { struct rb_root inode_tree; /* - * Xarray that keeps track of delayed nodes of every inode, protected - * by inode_lock + * radix tree that keeps track of delayed nodes of every inode, + * protected by inode_lock */ - struct xarray delayed_nodes; + struct radix_tree_root delayed_nodes_tree; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 66779ab3ed4a..748bf6b0d860 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( } spin_lock(&root->inode_lock); - node = xa_load(&root->delayed_nodes, ino); + node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { @@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( /* * It's possible that we're racing into the middle of removing - * this node from the xarray. In this case, the refcount + * this node from the radix tree. In this case, the refcount * was zero and it should never go back to one. Just return - * NULL like it was never in the xarray at all; our release + * NULL like it was never in the radix at all; our release * function is in the process of removing it. * * Some implementations of refcount_inc refuse to bump the @@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * here, refcount_inc() may decide to just WARN_ONCE() instead * of actually bumping the refcount. * - * If this node is properly in the xarray, we want to bump the + * If this node is properly in the radix, we want to bump the * refcount twice, once for the inode and once for this get * operation. */ @@ -128,30 +128,36 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( u64 ino = btrfs_ino(btrfs_inode); int ret; - do { - node = btrfs_get_delayed_node(btrfs_inode); - if (node) - return node; +again: + node = btrfs_get_delayed_node(btrfs_inode); + if (node) + return node; - node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); - if (!node) - return ERR_PTR(-ENOMEM); - btrfs_init_delayed_node(node, root, ino); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); - /* Cached in the inode and can be accessed */ - refcount_set(&node->refs, 2); + /* cached in the btrfs inode and can be accessed */ + refcount_set(&node->refs, 2); - spin_lock(&root->inode_lock); - ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS); - if (ret) { - spin_unlock(&root->inode_lock); - kmem_cache_free(delayed_node_cache, node); - if (ret != -EBUSY) - return ERR_PTR(ret); - } - } while (ret); + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + kmem_cache_free(delayed_node_cache, node); + return ERR_PTR(ret); + } + + spin_lock(&root->inode_lock); + ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); + if (ret == -EEXIST) { + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); + radix_tree_preload_end(); + goto again; + } btrfs_inode->delayed_node = node; spin_unlock(&root->inode_lock); + radix_tree_preload_end(); return node; } @@ -270,7 +276,8 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); - xa_erase(&root->delayed_nodes, delayed_node->inode_id); + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } @@ -1863,35 +1870,34 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { - unsigned long index = 0; - struct btrfs_delayed_node *delayed_node; + u64 inode_id = 0; struct btrfs_delayed_node *delayed_nodes[8]; + int i, n; while (1) { - int n = 0; - spin_lock(&root->inode_lock); - if (xa_empty(&root->delayed_nodes)) { + n = radix_tree_gang_lookup(&root->delayed_nodes_tree, + (void **)delayed_nodes, inode_id, + ARRAY_SIZE(delayed_nodes)); + if (!n) { spin_unlock(&root->inode_lock); - return; + break; } - xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) { + inode_id = delayed_nodes[n - 1]->inode_id + 1; + for (i = 0; i < n; i++) { /* * Don't increase refs in case the node is dead and * about to be removed from the tree in the loop below */ - if (refcount_inc_not_zero(&delayed_node->refs)) { - delayed_nodes[n] = delayed_node; - n++; - } - if (n >= ARRAY_SIZE(delayed_nodes)) - break; + if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) + delayed_nodes[i] = NULL; } - index++; spin_unlock(&root->inode_lock); - for (int i = 0; i < n; i++) { + for (i = 0; i < n; i++) { + if (!delayed_nodes[i]) + continue; __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f142ab43df36..daa756b3606d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1159,7 +1159,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; - xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); + INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); btrfs_init_root_block_rsv(root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 306673ef7256..329ad27f0918 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3910,7 +3910,7 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in the delayed_nodes xarray. + * in delayed_nodes_tree. */ if (BTRFS_I(inode)->last_trans == fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, From a14bd7475452c51835dd5a0cee4c8fa48dd0b539 Mon Sep 17 00:00:00 2001 From: Liang He Date: Thu, 14 Jul 2022 23:31:38 +0800 Subject: [PATCH 146/248] net: dsa: microchip: ksz_common: Fix refcount leak bug In ksz_switch_register(), we should call of_node_put() for the reference returned by of_get_child_by_name() which has increased the refcount. Fixes: 912aae27c6af ("net: dsa: microchip: really look for phy-mode in port nodes") Signed-off-by: Liang He Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220714153138.375919-1-windhl@126.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 9ca8c8d7740f..92a500e1ccd2 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -1038,18 +1038,21 @@ int ksz_switch_register(struct ksz_device *dev, ports = of_get_child_by_name(dev->dev->of_node, "ethernet-ports"); if (!ports) ports = of_get_child_by_name(dev->dev->of_node, "ports"); - if (ports) + if (ports) { for_each_available_child_of_node(ports, port) { if (of_property_read_u32(port, "reg", &port_num)) continue; if (!(dev->port_mask & BIT(port_num))) { of_node_put(port); + of_node_put(ports); return -EINVAL; } of_get_phy_mode(port, &dev->ports[port_num].interface); } + of_node_put(ports); + } dev->synclko_125 = of_property_read_bool(dev->dev->of_node, "microchip,synclko-125"); dev->synclko_disable = of_property_read_bool(dev->dev->of_node, From 11052589cf5c0bab3b4884d423d5f60c38fcf25d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 13 Jul 2022 10:52:07 -0700 Subject: [PATCH 147/248] tcp/udp: Make early_demux back namespacified. Commit e21145a9871a ("ipv4: namespacify ip_early_demux sysctl knob") made it possible to enable/disable early_demux on a per-netns basis. Then, we introduced two knobs, tcp_early_demux and udp_early_demux, to switch it for TCP/UDP in commit dddb64bcb346 ("net: Add sysctl to toggle early demux for tcp and udp"). However, the .proc_handler() was wrong and actually disabled us from changing the behaviour in each netns. We can execute early_demux if net.ipv4.ip_early_demux is on and each proto .early_demux() handler is not NULL. When we toggle (tcp|udp)_early_demux, the change itself is saved in each netns variable, but the .early_demux() handler is a global variable, so the handler is switched based on the init_net's sysctl variable. Thus, netns (tcp|udp)_early_demux knobs have nothing to do with the logic. Whether we CAN execute proto .early_demux() is always decided by init_net's sysctl knob, and whether we DO it or not is by each netns ip_early_demux knob. This patch namespacifies (tcp|udp)_early_demux again. For now, the users of the .early_demux() handler are TCP and UDP only, and they are called directly to avoid retpoline. So, we can remove the .early_demux() handler from inet6?_protos and need not dereference them in ip6?_rcv_finish_core(). If another proto needs .early_demux(), we can restore it at that time. Fixes: dddb64bcb346 ("net: Add sysctl to toggle early demux for tcp and udp") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220713175207.7727-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/protocol.h | 4 --- include/net/tcp.h | 2 +- include/net/udp.h | 2 +- net/ipv4/af_inet.c | 14 ++------- net/ipv4/ip_input.c | 35 +++++++++++++--------- net/ipv4/sysctl_net_ipv4.c | 59 ++------------------------------------ net/ipv6/ip6_input.c | 23 ++++++++------- net/ipv6/tcp_ipv6.c | 9 ++---- net/ipv6/udp.c | 9 ++---- 9 files changed, 44 insertions(+), 113 deletions(-) diff --git a/include/net/protocol.h b/include/net/protocol.h index f51c06ae365f..6aef8cb11cc8 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -35,8 +35,6 @@ /* This is used to register protocols. */ struct net_protocol { - int (*early_demux)(struct sk_buff *skb); - int (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); /* This returns an error if we weren't able to handle the error. */ @@ -52,8 +50,6 @@ struct net_protocol { #if IS_ENABLED(CONFIG_IPV6) struct inet6_protocol { - void (*early_demux)(struct sk_buff *skb); - void (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); /* This returns an error if we weren't able to handle the error. */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 1e99f5c61f84..1636c55e798b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -932,7 +932,7 @@ extern const struct inet_connection_sock_af_ops ipv6_specific; INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); -INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *skb)); +void tcp_v6_early_demux(struct sk_buff *skb); #endif diff --git a/include/net/udp.h b/include/net/udp.h index b83a00330566..bb4c227299cc 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -167,7 +167,7 @@ static inline void udp_csum_pull_header(struct sk_buff *skb) typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport, __be16 dport); -INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *)); +void udp_v6_early_demux(struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4bc24f9e38b3..8baef2f3fc4b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1710,24 +1710,14 @@ static const struct net_protocol igmp_protocol = { }; #endif -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct net_protocol tcp_protocol = { - .early_demux = tcp_v4_early_demux, - .early_demux_handler = tcp_v4_early_demux, +static const struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, .icmp_strict_tag_validation = 1, }; -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct net_protocol udp_protocol = { - .early_demux = udp_v4_early_demux, - .early_demux_handler = udp_v4_early_demux, +static const struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b1165f717cd1..1b512390b3cf 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -312,14 +312,13 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, ip_hdr(hint)->tos == iph->tos; } -INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *)); +int tcp_v4_early_demux(struct sk_buff *skb); +int udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); - int (*edemux)(struct sk_buff *skb); int err, drop_reason; struct rtable *rt; @@ -332,21 +331,29 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, goto drop_error; } - if (net->ipv4.sysctl_ip_early_demux && + if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { - const struct net_protocol *ipprot; - int protocol = iph->protocol; + switch (iph->protocol) { + case IPPROTO_TCP: + if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { + tcp_v4_early_demux(skb); - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux, - udp_v4_early_demux, skb); - if (unlikely(err)) - goto drop_error; - /* must reload iph, skb->head might have changed */ - iph = ip_hdr(skb); + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } + break; + case IPPROTO_UDP: + if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { + err = udp_v4_early_demux(skb); + if (unlikely(err)) + goto drop_error; + + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } + break; } } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 108fd86f2718..130e9c130311 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -350,61 +350,6 @@ bad_key: return ret; } -static void proc_configure_early_demux(int enabled, int protocol) -{ - struct net_protocol *ipprot; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_protocol *ip6prot; -#endif - - rcu_read_lock(); - - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot) - ipprot->early_demux = enabled ? ipprot->early_demux_handler : - NULL; - -#if IS_ENABLED(CONFIG_IPV6) - ip6prot = rcu_dereference(inet6_protos[protocol]); - if (ip6prot) - ip6prot->early_demux = enabled ? ip6prot->early_demux_handler : - NULL; -#endif - rcu_read_unlock(); -} - -static int proc_tcp_early_demux(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret = 0; - - ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); - - if (write && !ret) { - int enabled = init_net.ipv4.sysctl_tcp_early_demux; - - proc_configure_early_demux(enabled, IPPROTO_TCP); - } - - return ret; -} - -static int proc_udp_early_demux(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret = 0; - - ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); - - if (write && !ret) { - int enabled = init_net.ipv4.sysctl_udp_early_demux; - - proc_configure_early_demux(enabled, IPPROTO_UDP); - } - - return ret; -} - static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -707,14 +652,14 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_udp_early_demux, .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_udp_early_demux + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_demux", .data = &init_net.ipv4.sysctl_tcp_early_demux, .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_tcp_early_demux + .proc_handler = proc_dou8vec_minmax, }, { .procname = "nexthop_compat_mode", diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 0322cc86b84e..e1ebf5e42ebe 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -45,20 +45,23 @@ #include #include -INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *)); static void ip6_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb) { - void (*edemux)(struct sk_buff *skb); - - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { - const struct inet6_protocol *ipprot; - - ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); - if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) - INDIRECT_CALL_2(edemux, tcp_v6_early_demux, - udp_v6_early_demux, skb); + if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && + !skb_dst(skb) && !skb->sk) { + switch (ipv6_hdr(skb)->nexthdr) { + case IPPROTO_TCP: + if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) + tcp_v6_early_demux(skb); + break; + case IPPROTO_UDP: + if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) + udp_v6_early_demux(skb); + break; + } } + if (!skb_valid_dst(skb)) ip6_route_input(skb); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f37dd4aa91c6..9d3ede293258 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1822,7 +1822,7 @@ do_time_wait: goto discard_it; } -INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb) +void tcp_v6_early_demux(struct sk_buff *skb) { const struct ipv6hdr *hdr; const struct tcphdr *th; @@ -2176,12 +2176,7 @@ struct proto tcpv6_prot = { }; EXPORT_SYMBOL_GPL(tcpv6_prot); -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct inet6_protocol tcpv6_protocol = { - .early_demux = tcp_v6_early_demux, - .early_demux_handler = tcp_v6_early_demux, +static const struct inet6_protocol tcpv6_protocol = { .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 55afd7f39c04..e2f2e087a753 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1052,7 +1052,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, return NULL; } -INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb) +void udp_v6_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct udphdr *uh; @@ -1660,12 +1660,7 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname, return ipv6_getsockopt(sk, level, optname, optval, optlen); } -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct inet6_protocol udpv6_protocol = { - .early_demux = udp_v6_early_demux, - .early_demux_handler = udp_v6_early_demux, +static const struct inet6_protocol udpv6_protocol = { .handler = udpv6_rcv, .err_handler = udpv6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, From bcf163150cd37348a0cb59e95c916a83a9344b0e Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Fri, 8 Jul 2022 16:21:28 -0500 Subject: [PATCH 148/248] x86/bugs: Remove apostrophe typo Remove a superfluous ' in the mitigation string. Fixes: e8ec1b6e08a2 ("x86/bugs: Enable STIBP for JMP2RET") Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3a0787a36910..aa34f908c39f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1181,7 +1181,7 @@ spectre_v2_user_select_mitigation(void) if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET) { if (mode != SPECTRE_V2_USER_STRICT && mode != SPECTRE_V2_USER_STRICT_PREFERRED) - pr_info("Selecting STIBP always-on mode to complement retbleed mitigation'\n"); + pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); mode = SPECTRE_V2_USER_STRICT_PREFERRED; } From 51a6fa0732d6be6a44e0032752ad2ac10d67c796 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 15 Jul 2022 16:45:50 -0300 Subject: [PATCH 149/248] efi/x86: use naked RET on mixed mode call wrapper When running with return thunks enabled under 32-bit EFI, the system crashes with: kernel tried to execute NX-protected page - exploit attempt? (uid: 0) BUG: unable to handle page fault for address: 000000005bc02900 #PF: supervisor instruction fetch in kernel mode #PF: error_code(0x0011) - permissions violation PGD 18f7063 P4D 18f7063 PUD 18ff063 PMD 190e063 PTE 800000005bc02063 Oops: 0011 [#1] PREEMPT SMP PTI CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.19.0-rc6+ #166 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:0x5bc02900 Code: Unable to access opcode bytes at RIP 0x5bc028d6. RSP: 0018:ffffffffb3203e10 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000048 RDX: 000000000190dfac RSI: 0000000000001710 RDI: 000000007eae823b RBP: ffffffffb3203e70 R08: 0000000001970000 R09: ffffffffb3203e28 R10: 747563657865206c R11: 6c6977203a696665 R12: 0000000000001710 R13: 0000000000000030 R14: 0000000001970000 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8e013ca00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 0000000080050033 CR2: 000000005bc02900 CR3: 0000000001930000 CR4: 00000000000006f0 Call Trace: ? efi_set_virtual_address_map+0x9c/0x175 efi_enter_virtual_mode+0x4a6/0x53e start_kernel+0x67c/0x71e x86_64_start_reservations+0x24/0x2a x86_64_start_kernel+0xe9/0xf4 secondary_startup_64_no_verify+0xe5/0xeb That's because it cannot jump to the return thunk from the 32-bit code. Using a naked RET and marking it as safe allows the system to proceed booting. Fixes: aa3d480315ba ("x86: Use return-thunk in asm code") Reported-by: Guenter Roeck Signed-off-by: Thadeu Lima de Souza Cascardo Cc: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Tested-by: Guenter Roeck Signed-off-by: Linus Torvalds --- arch/x86/platform/efi/efi_thunk_64.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 9ffe2bad27d5..4e5257a4811b 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -23,6 +23,7 @@ #include #include #include +#include .text .code64 @@ -75,7 +76,9 @@ STACK_FRAME_NON_STANDARD __efi64_thunk 1: movq 0x20(%rsp), %rsp pop %rbx pop %rbp - RET + ANNOTATE_UNRET_SAFE + ret + int3 .code32 2: pushl $__KERNEL_CS From 829d680e82a961c5370d9636130b43009ac36eb8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 13 Jul 2022 17:11:15 +0200 Subject: [PATCH 150/248] random: cap jitter samples per bit to factor of HZ Currently the jitter mechanism will require two timer ticks per iteration, and it requires N iterations per bit. This N is determined with a small measurement, and if it's too big, it won't waste time with jitter entropy because it'd take too long or not have sufficient entropy anyway. With the current max N of 32, there are large timeouts on systems with a small CONFIG_HZ. Rather than set that maximum to 32, instead choose a factor of CONFIG_HZ. In this case, 1/30 seems to yield sane values for different configurations of CONFIG_HZ. Reported-by: Vladimir Murzin Fixes: 78c768e619fb ("random: vary jitter iterations based on cycle counter speed") Signed-off-by: Jason A. Donenfeld Tested-by: Vladimir Murzin Signed-off-by: Linus Torvalds --- drivers/char/random.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index e3dd1dd3dd22..a1af90bacc9f 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1174,7 +1174,7 @@ static void __cold entropy_timer(struct timer_list *timer) */ static void __cold try_to_generate_entropy(void) { - enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = 32 }; + enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 30 }; struct entropy_timer_state stack; unsigned int i, num_different = 0; unsigned long last = random_get_entropy(); From eee51fe38e372b89317f3950d2dc3e3ea7bace12 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:39:02 -0300 Subject: [PATCH 151/248] tools headers UAPI: Sync linux/kvm.h with the kernel sources To pick the changes in: 1b870fa5573e260b ("kvm: stats: tell userspace which values are boolean") That just rebuilds perf, as these patches don't add any new KVM ioctl to be harvested for the the 'perf trace' ioctl syscall argument beautifiers. This is also by now used by tools/testing/selftests/kvm/, a simple test build succeeded. This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Link: http://lore.kernel.org/lkml/YtQLDvQrBhJNl3n5@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 5088bd9f1922..811897dadcae 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -2083,6 +2083,7 @@ struct kvm_stats_header { #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES #define KVM_STATS_BASE_SHIFT 8 From f098addbdb44c8a565367f5162f3ab170ed9404a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 1 Jul 2021 13:39:15 -0300 Subject: [PATCH 152/248] tools headers cpufeatures: Sync with the kernel sources To pick the changes from: f43b9876e857c739 ("x86/retbleed: Add fine grained Kconfig knobs") a149180fbcf336e9 ("x86: Add magic AMD return-thunk") 15e67227c49a5783 ("x86: Undo return-thunk damage") 369ae6ffc41a3c11 ("x86/retpoline: Cleanup some #ifdefery") 4ad3278df6fe2b08 x86/speculation: Disable RRSBA behavior 26aae8ccbc197223 x86/cpu/amd: Enumerate BTC_NO 9756bba28470722d x86/speculation: Fill RSB on vmexit for IBRS 3ebc170068885b6f x86/bugs: Add retbleed=ibpb 2dbb887e875b1de3 x86/entry: Add kernel IBRS implementation 6b80b59b35557065 x86/bugs: Report AMD retbleed vulnerability a149180fbcf336e9 x86: Add magic AMD return-thunk 15e67227c49a5783 x86: Undo return-thunk damage a883d624aed463c8 x86/cpufeatures: Move RETPOLINE flags to word 11 51802186158c74a0 x86/speculation/mmio: Enumerate Processor MMIO Stale Data bug This only causes these perf files to be rebuilt: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Warning: Kernel ABI header at 'tools/arch/x86/include/asm/disabled-features.h' differs from latest version at 'arch/x86/include/asm/disabled-features.h' diff -u tools/arch/x86/include/asm/disabled-features.h arch/x86/include/asm/disabled-features.h Cc: Adrian Hunter Cc: Borislav Petkov Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra --- tools/arch/x86/include/asm/cpufeatures.h | 12 +++++++++-- .../arch/x86/include/asm/disabled-features.h | 21 ++++++++++++++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 03acc823838a..00f5227c8459 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -203,8 +203,8 @@ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ @@ -296,6 +296,12 @@ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ #define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ #define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ +#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */ +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ +#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ +#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -316,6 +322,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ +#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ #define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ @@ -447,5 +454,6 @@ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ +#define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 36369e76cc63..33d2cd04d254 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -50,6 +50,25 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif +#ifdef CONFIG_RETPOLINE +# define DISABLE_RETPOLINE 0 +#else +# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ + (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) +#endif + +#ifdef CONFIG_RETHUNK +# define DISABLE_RETHUNK 0 +#else +# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) +#endif + +#ifdef CONFIG_CPU_UNRET_ENTRY +# define DISABLE_UNRET 0 +#else +# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM # define DISABLE_ENQCMD 0 #else @@ -82,7 +101,7 @@ #define DISABLED_MASK8 (DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 -#define DISABLED_MASK11 0 +#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET) #define DISABLED_MASK12 0 #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 From 91d248c3b903b46a58cbc7e8d38d684d3e4007c2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 1 Jul 2021 13:32:18 -0300 Subject: [PATCH 153/248] tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes from these csets: 4ad3278df6fe2b08 ("x86/speculation: Disable RRSBA behavior") d7caac991feeef1b ("x86/cpu/amd: Add Spectral Chicken") That cause no changes to tooling: $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after $ diff -u before after $ Just silences this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/msr-index.h' differs from latest version at 'arch/x86/include/asm/msr-index.h' diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Cc: Adrian Hunter Cc: Borislav Petkov Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Pawan Gupta Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/YtQTm9wsB3hxQWvy@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/msr-index.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 2eab6a3a8a8c..cc615be27a54 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -95,6 +95,7 @@ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a #define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ #define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ +#define ARCH_CAP_RSBA BIT(2) /* RET may use alternative branch predictors */ #define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ #define ARCH_CAP_SSB_NO BIT(4) /* * Not susceptible to Speculative Store Bypass @@ -576,6 +577,9 @@ /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 +#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 +#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) + /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 From 498c7a54f169b2699104d3060604d840424f15d2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 13 Jul 2022 15:34:58 +0300 Subject: [PATCH 154/248] perf tests: Stop Convert perf time to TSC test opening events twice Do not call evlist__open() twice. Fixes: 5bb017d4b97a0f13 ("perf test: Fix error message for test case 71 on s390, where it is not supported") Reviewed-by: Kan Liang Signed-off-by: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Thomas Richter Link: https://lore.kernel.org/r/20220713123459.24145-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/perf-time-to-tsc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c index 4ad0dfbc8b21..8d6d60173693 100644 --- a/tools/perf/tests/perf-time-to-tsc.c +++ b/tools/perf/tests/perf-time-to-tsc.c @@ -123,11 +123,14 @@ static int test__perf_time_to_tsc(struct test_suite *test __maybe_unused, int su evsel->core.attr.enable_on_exec = 0; } - if (evlist__open(evlist) == -ENOENT) { - err = TEST_SKIP; + ret = evlist__open(evlist); + if (ret < 0) { + if (ret == -ENOENT) + err = TEST_SKIP; + else + pr_debug("evlist__open() failed\n"); goto out_err; } - CHECK__(evlist__open(evlist)); CHECK__(evlist__mmap(evlist, UINT_MAX)); From deb44a6249f696106645c63c0603eab08a6122af Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 13 Jul 2022 15:34:59 +0300 Subject: [PATCH 155/248] perf tests: Fix Convert perf time to TSC test for hybrid The test does not always correctly determine the number of events for hybrids, nor allow for more than 1 evsel when parsing. Fix by iterating the events actually created and getting the correct evsel for the events processed. Fixes: d9da6f70eb235110 ("perf tests: Support 'Convert perf time to TSC' test for hybrid") Reviewed-by: Kan Liang Signed-off-by: Adrian Hunter Cc: Ian Rogers Cc: Jin Yao Cc: Jiri Olsa Cc: Namhyung Kim Cc: Thomas Richter Link: https://lore.kernel.org/r/20220713123459.24145-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/perf-time-to-tsc.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c index 8d6d60173693..7c7d20fc503a 100644 --- a/tools/perf/tests/perf-time-to-tsc.c +++ b/tools/perf/tests/perf-time-to-tsc.c @@ -20,8 +20,6 @@ #include "tsc.h" #include "mmap.h" #include "tests.h" -#include "pmu.h" -#include "pmu-hybrid.h" /* * Except x86_64/i386 and Arm64, other archs don't support TSC in perf. Just @@ -106,18 +104,8 @@ static int test__perf_time_to_tsc(struct test_suite *test __maybe_unused, int su evlist__config(evlist, &opts, NULL); - evsel = evlist__first(evlist); - - evsel->core.attr.comm = 1; - evsel->core.attr.disabled = 1; - evsel->core.attr.enable_on_exec = 0; - - /* - * For hybrid "cycles:u", it creates two events. - * Init the second evsel here. - */ - if (perf_pmu__has_hybrid() && perf_pmu__hybrid_mounted("cpu_atom")) { - evsel = evsel__next(evsel); + /* For hybrid "cycles:u", it creates two events */ + evlist__for_each_entry(evlist, evsel) { evsel->core.attr.comm = 1; evsel->core.attr.disabled = 1; evsel->core.attr.enable_on_exec = 0; @@ -170,10 +158,12 @@ static int test__perf_time_to_tsc(struct test_suite *test __maybe_unused, int su goto next_event; if (strcmp(event->comm.comm, comm1) == 0) { + CHECK_NOT_NULL__(evsel = evlist__event2evsel(evlist, event)); CHECK__(evsel__parse_sample(evsel, event, &sample)); comm1_time = sample.time; } if (strcmp(event->comm.comm, comm2) == 0) { + CHECK_NOT_NULL__(evsel = evlist__event2evsel(evlist, event)); CHECK__(evsel__parse_sample(evsel, event, &sample)); comm2_time = sample.time; } From 4b335e1e0d6f8fa91dac615a44b123c9f26e93d3 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 7 Jul 2022 14:39:00 +0530 Subject: [PATCH 156/248] perf trace: Fix SIGSEGV when processing syscall args On powerpc, 'perf trace' is crashing with a SIGSEGV when trying to process a perf.data file created with 'perf trace record -p': #0 0x00000001225b8988 in syscall_arg__scnprintf_augmented_string at builtin-trace.c:1492 #1 syscall_arg__scnprintf_filename at builtin-trace.c:1492 #2 syscall_arg__scnprintf_filename at builtin-trace.c:1486 #3 0x00000001225bdd9c in syscall_arg_fmt__scnprintf_val at builtin-trace.c:1973 #4 syscall__scnprintf_args at builtin-trace.c:2041 #5 0x00000001225bff04 in trace__sys_enter at builtin-trace.c:2319 That points to the below code in tools/perf/builtin-trace.c: /* * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible * arguments, even if the syscall being handled, say "openat", uses only 4 arguments * this breaks syscall__augmented_args() check for augmented args, as we calculate * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file, * so when handling, say the openat syscall, we end up getting 6 args for the * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly * thinking that the extra 2 u64 args are the augmented filename, so just check * here and avoid using augmented syscalls when the evsel is the raw_syscalls one. */ if (evsel != trace->syscalls.events.sys_enter) augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size); As the comment points out, we should not be trying to augment the args for raw_syscalls. However, when processing a perf.data file, we are not initializing those properly. Fix the same. Reported-by: Claudio Carvalho Signed-off-by: Naveen N. Rao Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lore.kernel.org/lkml/20220707090900.572584-1-naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 897fc504918b..f075cf37a65e 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -4280,6 +4280,7 @@ static int trace__replay(struct trace *trace) goto out; evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter"); + trace->syscalls.events.sys_enter = evsel; /* older kernels have syscalls tp versus raw_syscalls */ if (evsel == NULL) evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter"); @@ -4292,6 +4293,7 @@ static int trace__replay(struct trace *trace) } evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit"); + trace->syscalls.events.sys_exit = evsel; if (evsel == NULL) evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit"); if (evsel && From ced7866db39fc5c59ee05e154d4abc0977a17f6b Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Tue, 12 Jul 2022 18:40:50 +0100 Subject: [PATCH 157/248] drm/i915/ttm: fix 32b build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since segment_pages is no longer a compile time constant, it looks the DIV_ROUND_UP(node->size, segment_pages) breaks the 32b build. Simplest is just to use the ULL variant, but really we should need not need more than u32 for the page alignment (also we are limited by that due to the sg->length type), so also make it all u32. Reported-by: Ville Syrjälä Fixes: aff1e0b09b54 ("drm/i915/ttm: fix sg_table construction") Signed-off-by: Matthew Auld Cc: Nirmoy Das Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20220712174050.592550-1-matthew.auld@intel.com (cherry picked from commit 9306b2b2dfce6931241ef804783692cee526599c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_region.c | 2 ++ drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +- drivers/gpu/drm/i915/i915_scatterlist.c | 16 ++++++++-------- drivers/gpu/drm/i915/i915_scatterlist.h | 4 ++-- drivers/gpu/drm/i915/intel_region_ttm.c | 2 +- drivers/gpu/drm/i915/intel_region_ttm.h | 2 +- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_region.c b/drivers/gpu/drm/i915/gem/i915_gem_region.c index f46ee16a323a..a4fb577eceb4 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_region.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_region.c @@ -60,6 +60,8 @@ __i915_gem_object_create_region(struct intel_memory_region *mem, if (page_size) default_page_size = page_size; + /* We should be able to fit a page within an sg entry */ + GEM_BUG_ON(overflows_type(default_page_size, u32)); GEM_BUG_ON(!is_power_of_2_u64(default_page_size)); GEM_BUG_ON(default_page_size < PAGE_SIZE); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c index d30ebcaec8b9..8f1bb6a4b7d1 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c @@ -620,7 +620,7 @@ i915_ttm_resource_get_st(struct drm_i915_gem_object *obj, struct ttm_resource *res) { struct ttm_buffer_object *bo = i915_gem_to_ttm(obj); - u64 page_alignment; + u32 page_alignment; if (!i915_ttm_gtt_binds_lmem(res)) return i915_ttm_tt_get_st(bo->ttm); diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c b/drivers/gpu/drm/i915/i915_scatterlist.c index f63b50b71e10..dcc081874ec8 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.c +++ b/drivers/gpu/drm/i915/i915_scatterlist.c @@ -79,10 +79,10 @@ void i915_refct_sgt_init(struct i915_refct_sgt *rsgt, size_t size) */ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, u64 region_start, - u64 page_alignment) + u32 page_alignment) { - const u64 max_segment = round_down(UINT_MAX, page_alignment); - u64 segment_pages = max_segment >> PAGE_SHIFT; + const u32 max_segment = round_down(UINT_MAX, page_alignment); + const u32 segment_pages = max_segment >> PAGE_SHIFT; u64 block_size, offset, prev_end; struct i915_refct_sgt *rsgt; struct sg_table *st; @@ -96,7 +96,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, i915_refct_sgt_init(rsgt, node->size << PAGE_SHIFT); st = &rsgt->table; - if (sg_alloc_table(st, DIV_ROUND_UP(node->size, segment_pages), + if (sg_alloc_table(st, DIV_ROUND_UP_ULL(node->size, segment_pages), GFP_KERNEL)) { i915_refct_sgt_put(rsgt); return ERR_PTR(-ENOMEM); @@ -123,7 +123,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, st->nents++; } - len = min(block_size, max_segment - sg->length); + len = min_t(u64, block_size, max_segment - sg->length); sg->length += len; sg_dma_len(sg) += len; @@ -155,11 +155,11 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, */ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, u64 region_start, - u64 page_alignment) + u32 page_alignment) { struct i915_ttm_buddy_resource *bman_res = to_ttm_buddy_resource(res); const u64 size = res->num_pages << PAGE_SHIFT; - const u64 max_segment = round_down(UINT_MAX, page_alignment); + const u32 max_segment = round_down(UINT_MAX, page_alignment); struct drm_buddy *mm = bman_res->mm; struct list_head *blocks = &bman_res->blocks; struct drm_buddy_block *block; @@ -207,7 +207,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, st->nents++; } - len = min(block_size, max_segment - sg->length); + len = min_t(u64, block_size, max_segment - sg->length); sg->length += len; sg_dma_len(sg) += len; diff --git a/drivers/gpu/drm/i915/i915_scatterlist.h b/drivers/gpu/drm/i915/i915_scatterlist.h index b13e4cdea923..9ddb3e743a3e 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.h +++ b/drivers/gpu/drm/i915/i915_scatterlist.h @@ -214,10 +214,10 @@ void i915_refct_sgt_init(struct i915_refct_sgt *rsgt, size_t size); struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, u64 region_start, - u64 page_alignment); + u32 page_alignment); struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, u64 region_start, - u64 page_alignment); + u32 page_alignment); #endif diff --git a/drivers/gpu/drm/i915/intel_region_ttm.c b/drivers/gpu/drm/i915/intel_region_ttm.c index 6873808a7015..575d67bc6ffe 100644 --- a/drivers/gpu/drm/i915/intel_region_ttm.c +++ b/drivers/gpu/drm/i915/intel_region_ttm.c @@ -163,7 +163,7 @@ int intel_region_ttm_fini(struct intel_memory_region *mem) struct i915_refct_sgt * intel_region_ttm_resource_to_rsgt(struct intel_memory_region *mem, struct ttm_resource *res, - u64 page_alignment) + u32 page_alignment) { if (mem->is_range_manager) { struct ttm_range_mgr_node *range_node = diff --git a/drivers/gpu/drm/i915/intel_region_ttm.h b/drivers/gpu/drm/i915/intel_region_ttm.h index 98fba5155619..5bb8d8b582ae 100644 --- a/drivers/gpu/drm/i915/intel_region_ttm.h +++ b/drivers/gpu/drm/i915/intel_region_ttm.h @@ -25,7 +25,7 @@ int intel_region_ttm_fini(struct intel_memory_region *mem); struct i915_refct_sgt * intel_region_ttm_resource_to_rsgt(struct intel_memory_region *mem, struct ttm_resource *res, - u64 page_alignment); + u32 page_alignment); void intel_region_ttm_resource_free(struct intel_memory_region *mem, struct ttm_resource *res); From ff6992735ade75aae3e35d16b17da1008d753d28 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 17 Jul 2022 13:30:22 -0700 Subject: [PATCH 158/248] Linux 5.19-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index faa4880f25f7..00fd80c5dd6e 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 19 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Superb Owl # *DOCUMENTATION* From 76c16d3e19446deea98b7883f261758b96b8781a Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Thu, 14 Jul 2022 15:54:27 +0800 Subject: [PATCH 159/248] net: stmmac: switch to use interrupt for hw crosstimestamping Using current implementation of polling mode, there is high chances we will hit into timeout error when running phc2sys. Hence, update the implementation of hardware crosstimestamping to use the MAC interrupt service routine instead of polling for TSIS bit in the MAC Timestamp Interrupt Status register to be set. Cc: Richard Cochran Signed-off-by: Wong Vee Khee Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/dwmac-intel.c | 25 ++++++++++++------- drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 3 ++- .../net/ethernet/stmicro/stmmac/dwmac4_core.c | 4 +++ drivers/net/ethernet/stmicro/stmmac/stmmac.h | 1 + .../ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 5 ++++ .../net/ethernet/stmicro/stmmac/stmmac_ptp.c | 12 +-------- include/linux/stmmac.h | 1 + 7 files changed, 30 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 38fe77d1035e..3fe720c5dc9f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -298,6 +298,11 @@ static void get_arttime(struct mii_bus *mii, int intel_adhoc_addr, *art_time = ns; } +static int stmmac_cross_ts_isr(struct stmmac_priv *priv) +{ + return (readl(priv->ioaddr + GMAC_INT_STATUS) & GMAC_INT_TSIE); +} + static int intel_crosststamp(ktime_t *device, struct system_counterval_t *system, void *ctx) @@ -313,8 +318,6 @@ static int intel_crosststamp(ktime_t *device, u32 num_snapshot; u32 gpio_value; u32 acr_value; - int ret; - u32 v; int i; if (!boot_cpu_has(X86_FEATURE_ART)) @@ -328,6 +331,8 @@ static int intel_crosststamp(ktime_t *device, if (priv->plat->ext_snapshot_en) return -EBUSY; + priv->plat->int_snapshot_en = 1; + mutex_lock(&priv->aux_ts_lock); /* Enable Internal snapshot trigger */ acr_value = readl(ptpaddr + PTP_ACR); @@ -347,6 +352,7 @@ static int intel_crosststamp(ktime_t *device, break; default: mutex_unlock(&priv->aux_ts_lock); + priv->plat->int_snapshot_en = 0; return -EINVAL; } writel(acr_value, ptpaddr + PTP_ACR); @@ -368,13 +374,12 @@ static int intel_crosststamp(ktime_t *device, gpio_value |= GMAC_GPO1; writel(gpio_value, ioaddr + GMAC_GPIO_STATUS); - /* Poll for time sync operation done */ - ret = readl_poll_timeout(priv->ioaddr + GMAC_INT_STATUS, v, - (v & GMAC_INT_TSIE), 100, 10000); - - if (ret == -ETIMEDOUT) { - pr_err("%s: Wait for time sync operation timeout\n", __func__); - return ret; + /* Time sync done Indication - Interrupt method */ + if (!wait_event_interruptible_timeout(priv->tstamp_busy_wait, + stmmac_cross_ts_isr(priv), + HZ / 100)) { + priv->plat->int_snapshot_en = 0; + return -ETIMEDOUT; } num_snapshot = (readl(ioaddr + GMAC_TIMESTAMP_STATUS) & @@ -392,6 +397,7 @@ static int intel_crosststamp(ktime_t *device, } system->cycles *= intel_priv->crossts_adj; + priv->plat->int_snapshot_en = 0; return 0; } @@ -576,6 +582,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->has_crossts = true; plat->crosststamp = intel_crosststamp; + plat->int_snapshot_en = 0; /* Setup MSI vector offset specific to Intel mGbE controller */ plat->msi_mac_vec = 29; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index 462ca7ed095a..71dad409f78b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -150,7 +150,8 @@ #define GMAC_PCS_IRQ_DEFAULT (GMAC_INT_RGSMIIS | GMAC_INT_PCS_LINK | \ GMAC_INT_PCS_ANE) -#define GMAC_INT_DEFAULT_ENABLE (GMAC_INT_PMT_EN | GMAC_INT_LPI_EN) +#define GMAC_INT_DEFAULT_ENABLE (GMAC_INT_PMT_EN | GMAC_INT_LPI_EN | \ + GMAC_INT_TSIE) enum dwmac4_irq_status { time_stamp_irq = 0x00001000, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index fd41db65fe1d..d5299dd13e85 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -23,6 +23,7 @@ static void dwmac4_core_init(struct mac_device_info *hw, struct net_device *dev) { + struct stmmac_priv *priv = netdev_priv(dev); void __iomem *ioaddr = hw->pcsr; u32 value = readl(ioaddr + GMAC_CONFIG); @@ -58,6 +59,9 @@ static void dwmac4_core_init(struct mac_device_info *hw, value |= GMAC_INT_FPE_EN; writel(value, ioaddr + GMAC_INT_EN); + + if (GMAC_INT_DEFAULT_ENABLE & GMAC_INT_TSIE) + init_waitqueue_head(&priv->tstamp_busy_wait); } static void dwmac4_rx_queue_enable(struct mac_device_info *hw, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 57970ae2178d..f9e83964aa7e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -266,6 +266,7 @@ struct stmmac_priv { rwlock_t ptp_lock; /* Protects auxiliary snapshot registers from concurrent access. */ struct mutex aux_ts_lock; + wait_queue_head_t tstamp_busy_wait; void __iomem *mmcaddr; void __iomem *ptpaddr; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index 92d32940aff0..764832f4dae1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -179,6 +179,11 @@ static void timestamp_interrupt(struct stmmac_priv *priv) u64 ptp_time; int i; + if (priv->plat->int_snapshot_en) { + wake_up(&priv->tstamp_busy_wait); + return; + } + tsync_int = readl(priv->ioaddr + GMAC_INT_STATUS) & GMAC_INT_TSIE; if (!tsync_int) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index e45fb191d8e6..4d11980dcd64 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -175,11 +175,10 @@ static int stmmac_enable(struct ptp_clock_info *ptp, struct stmmac_priv *priv = container_of(ptp, struct stmmac_priv, ptp_clock_ops); void __iomem *ptpaddr = priv->ptpaddr; - void __iomem *ioaddr = priv->hw->pcsr; struct stmmac_pps_cfg *cfg; - u32 intr_value, acr_value; int ret = -EOPNOTSUPP; unsigned long flags; + u32 acr_value; switch (rq->type) { case PTP_CLK_REQ_PEROUT: @@ -213,19 +212,10 @@ static int stmmac_enable(struct ptp_clock_info *ptp, netdev_dbg(priv->dev, "Auxiliary Snapshot %d enabled.\n", priv->plat->ext_snapshot_num >> PTP_ACR_ATSEN_SHIFT); - /* Enable Timestamp Interrupt */ - intr_value = readl(ioaddr + GMAC_INT_EN); - intr_value |= GMAC_INT_TSIE; - writel(intr_value, ioaddr + GMAC_INT_EN); - } else { netdev_dbg(priv->dev, "Auxiliary Snapshot %d disabled.\n", priv->plat->ext_snapshot_num >> PTP_ACR_ATSEN_SHIFT); - /* Disable Timestamp Interrupt */ - intr_value = readl(ioaddr + GMAC_INT_EN); - intr_value &= ~GMAC_INT_TSIE; - writel(intr_value, ioaddr + GMAC_INT_EN); } writel(acr_value, ptpaddr + PTP_ACR); mutex_unlock(&priv->aux_ts_lock); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 29917850f079..8df475db88c0 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -260,6 +260,7 @@ struct plat_stmmacenet_data { bool has_crossts; int int_snapshot_num; int ext_snapshot_num; + bool int_snapshot_en; bool ext_snapshot_en; bool multi_msi_en; int msi_mac_vec; From 613b065ca32e90209024ec4a6bb5ca887ee70980 Mon Sep 17 00:00:00 2001 From: Junxiao Chang Date: Fri, 15 Jul 2022 15:47:01 +0800 Subject: [PATCH 160/248] net: stmmac: fix dma queue left shift overflow issue When queue number is > 4, left shift overflows due to 32 bits integer variable. Mask calculation is wrong for MTL_RXQ_DMA_MAP1. If CONFIG_UBSAN is enabled, kernel dumps below warning: [ 10.363842] ================================================================== [ 10.363882] UBSAN: shift-out-of-bounds in /build/linux-intel-iotg-5.15-8e6Tf4/ linux-intel-iotg-5.15-5.15.0/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c:224:12 [ 10.363929] shift exponent 40 is too large for 32-bit type 'unsigned int' [ 10.363953] CPU: 1 PID: 599 Comm: NetworkManager Not tainted 5.15.0-1003-intel-iotg [ 10.363956] Hardware name: ADLINK Technology Inc. LEC-EL/LEC-EL, BIOS 0.15.11 12/22/2021 [ 10.363958] Call Trace: [ 10.363960] [ 10.363963] dump_stack_lvl+0x4a/0x5f [ 10.363971] dump_stack+0x10/0x12 [ 10.363974] ubsan_epilogue+0x9/0x45 [ 10.363976] __ubsan_handle_shift_out_of_bounds.cold+0x61/0x10e [ 10.363979] ? wake_up_klogd+0x4a/0x50 [ 10.363983] ? vprintk_emit+0x8f/0x240 [ 10.363986] dwmac4_map_mtl_dma.cold+0x42/0x91 [stmmac] [ 10.364001] stmmac_mtl_configuration+0x1ce/0x7a0 [stmmac] [ 10.364009] ? dwmac410_dma_init_channel+0x70/0x70 [stmmac] [ 10.364020] stmmac_hw_setup.cold+0xf/0xb14 [stmmac] [ 10.364030] ? page_pool_alloc_pages+0x4d/0x70 [ 10.364034] ? stmmac_clear_tx_descriptors+0x6e/0xe0 [stmmac] [ 10.364042] stmmac_open+0x39e/0x920 [stmmac] [ 10.364050] __dev_open+0xf0/0x1a0 [ 10.364054] __dev_change_flags+0x188/0x1f0 [ 10.364057] dev_change_flags+0x26/0x60 [ 10.364059] do_setlink+0x908/0xc40 [ 10.364062] ? do_setlink+0xb10/0xc40 [ 10.364064] ? __nla_validate_parse+0x4c/0x1a0 [ 10.364068] __rtnl_newlink+0x597/0xa10 [ 10.364072] ? __nla_reserve+0x41/0x50 [ 10.364074] ? __kmalloc_node_track_caller+0x1d0/0x4d0 [ 10.364079] ? pskb_expand_head+0x75/0x310 [ 10.364082] ? nla_reserve_64bit+0x21/0x40 [ 10.364086] ? skb_free_head+0x65/0x80 [ 10.364089] ? security_sock_rcv_skb+0x2c/0x50 [ 10.364094] ? __cond_resched+0x19/0x30 [ 10.364097] ? kmem_cache_alloc_trace+0x15a/0x420 [ 10.364100] rtnl_newlink+0x49/0x70 This change fixes MTL_RXQ_DMA_MAP1 mask issue and channel/queue mapping warning. Fixes: d43042f4da3e ("net: stmmac: mapping mtl rx to dma channel") BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=216195 Reported-by: Cedric Wassenaar Signed-off-by: Junxiao Chang Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index d5299dd13e85..d8f1fbc25bdd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -223,6 +223,9 @@ static void dwmac4_map_mtl_dma(struct mac_device_info *hw, u32 queue, u32 chan) if (queue == 0 || queue == 4) { value &= ~MTL_RXQ_DMA_Q04MDMACH_MASK; value |= MTL_RXQ_DMA_Q04MDMACH(chan); + } else if (queue > 4) { + value &= ~MTL_RXQ_DMA_QXMDMACH_MASK(queue - 4); + value |= MTL_RXQ_DMA_QXMDMACH(chan, queue - 4); } else { value &= ~MTL_RXQ_DMA_QXMDMACH_MASK(queue); value |= MTL_RXQ_DMA_QXMDMACH(chan, queue); From f08d8c1bb97c48f24a82afaa2fd8c140f8d3da8b Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Fri, 15 Jul 2022 11:42:16 +0300 Subject: [PATCH 161/248] net/tls: Fix race in TLS device down flow Socket destruction flow and tls_device_down function sync against each other using tls_device_lock and the context refcount, to guarantee the device resources are freed via tls_dev_del() by the end of tls_device_down. In the following unfortunate flow, this won't happen: - refcount is decreased to zero in tls_device_sk_destruct. - tls_device_down starts, skips the context as refcount is zero, going all the way until it flushes the gc work, and returns without freeing the device resources. - only then, tls_device_queue_ctx_destruction is called, queues the gc work and frees the context's device resources. Solve it by decreasing the refcount in the socket's destruction flow under the tls_device_lock, for perfect synchronization. This does not slow down the common likely destructor flow, in which both the refcount is decreased and the spinlock is acquired, anyway. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Reviewed-by: Maxim Mikityanskiy Signed-off-by: Tariq Toukan Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tls/tls_device.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index ce827e79c66a..879b9024678e 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -97,13 +97,16 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx) unsigned long flags; spin_lock_irqsave(&tls_device_lock, flags); + if (unlikely(!refcount_dec_and_test(&ctx->refcount))) + goto unlock; + list_move_tail(&ctx->list, &tls_device_gc_list); /* schedule_work inside the spinlock * to make sure tls_device_down waits for that work. */ schedule_work(&tls_device_gc_work); - +unlock: spin_unlock_irqrestore(&tls_device_lock, flags); } @@ -194,8 +197,7 @@ void tls_device_sk_destruct(struct sock *sk) clean_acked_data_disable(inet_csk(sk)); } - if (refcount_dec_and_test(&tls_ctx->refcount)) - tls_device_queue_ctx_destruction(tls_ctx); + tls_device_queue_ctx_destruction(tls_ctx); } EXPORT_SYMBOL_GPL(tls_device_sk_destruct); From 1e20904e417738066b26490de2daf7ef3ed34483 Mon Sep 17 00:00:00 2001 From: Maksym Glubokiy Date: Fri, 15 Jul 2022 15:55:50 +0300 Subject: [PATCH 162/248] net: prestera: acl: use proper mask for port selector Adjusted as per packet processor documentation. This allows to properly match 'indev' for clsact rules. Fixes: 47327e198d42 ("net: prestera: acl: migrate to new vTCAM api") Signed-off-by: Maksym Glubokiy Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/prestera/prestera_flower.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_flower.c b/drivers/net/ethernet/marvell/prestera/prestera_flower.c index d43e503c644f..4d93ad6a284c 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_flower.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_flower.c @@ -167,12 +167,12 @@ static int prestera_flower_parse_meta(struct prestera_acl_rule *rule, } port = netdev_priv(ingress_dev); - mask = htons(0x1FFF); - key = htons(port->hw_id); + mask = htons(0x1FFF << 3); + key = htons(port->hw_id << 3); rule_match_set(r_match->key, SYS_PORT, key); rule_match_set(r_match->mask, SYS_PORT, mask); - mask = htons(0x1FF); + mask = htons(0x3FF); key = htons(port->dev_id); rule_match_set(r_match->key, SYS_DEV, key); rule_match_set(r_match->mask, SYS_DEV, mask); From f6da2267e71106474fbc0943dc24928b9cb79119 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:41 -0700 Subject: [PATCH 163/248] igmp: Fix data-races around sysctl_igmp_llm_reports. While reading sysctl_igmp_llm_reports, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. This test can be packed into a helper, so such changes will be in the follow-up series after net is merged into net-next. if (ipv4_is_local_multicast(pmc->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) Fixes: df2cf4a78e48 ("IGMP: Inhibit reports for local multicast groups") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index b65d074d9620..cf75fff170e4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -467,7 +467,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; - if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(pmc->multiaddr) && + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return skb; mtu = READ_ONCE(dev->mtu); @@ -593,7 +594,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(pmc->multiaddr) && - !net->ipv4.sysctl_igmp_llm_reports) + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) @@ -736,7 +737,8 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); - if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return 0; if (type == IGMP_HOST_LEAVE_MESSAGE) @@ -920,7 +922,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) if (group == IGMP_ALL_HOSTS) return false; - if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return false; rcu_read_lock(); @@ -1045,7 +1048,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !net->ipv4.sysctl_igmp_llm_reports) + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&im->lock); if (im->tm_running) @@ -1296,7 +1299,8 @@ static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; reporter = im->reporter; @@ -1338,7 +1342,8 @@ static void igmp_group_added(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; if (in_dev->dead) @@ -1642,7 +1647,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !net->ipv4.sysctl_igmp_llm_reports) + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; /* a failover is happening and switches From 6305d821e3b9b5379d348528e5b5faf316383bc2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:42 -0700 Subject: [PATCH 164/248] igmp: Fix a data-race around sysctl_igmp_max_memberships. While reading sysctl_igmp_max_memberships, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index cf75fff170e4..792ea1b56b9e 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2197,7 +2197,7 @@ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, count++; } err = -ENOBUFS; - if (count >= net->ipv4.sysctl_igmp_max_memberships) + if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships)) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (!iml) From 6ae0f2e553737b8cce49a1372573c81130ffa80e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:43 -0700 Subject: [PATCH 165/248] igmp: Fix data-races around sysctl_igmp_max_msf. While reading sysctl_igmp_max_msf, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 2 +- net/ipv4/ip_sockglue.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 792ea1b56b9e..cd7839db34da 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2384,7 +2384,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct } /* else, add a new source to the filter */ - if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) { + if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { err = -ENOBUFS; goto done; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d497d525dea3..a8a323ecbb54 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -782,7 +782,7 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) /* numsrc >= (4G-140)/128 overflow in 32 bits */ err = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffff || - gsf->gf_numsrc > sock_net(sk)->ipv4.sysctl_igmp_max_msf) + gsf->gf_numsrc > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf)) goto out_free_gsf; err = -EINVAL; @@ -832,7 +832,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, /* numsrc >= (4G-140)/128 overflow in 32 bits */ err = -ENOBUFS; - if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) + if (n > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf)) goto out_free_gsf; err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode, &gf32->gf_group, gf32->gf_slist_flex); @@ -1244,7 +1244,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, } /* numsrc >= (1G-4) overflow in 32 bits */ if (msf->imsf_numsrc >= 0x3ffffffcU || - msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) { + msf->imsf_numsrc > READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { kfree(msf); err = -ENOBUFS; break; From 8ebcc62c738f68688ee7c6fec2efe5bc6d3d7e60 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:44 -0700 Subject: [PATCH 166/248] igmp: Fix data-races around sysctl_igmp_qrv. While reading sysctl_igmp_qrv, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. This test can be packed into a helper, so such changes will be in the follow-up series after net is merged into net-next. qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); Fixes: a9fe8e29945d ("ipv4: implement igmp_qrv sysctl to tune igmp robustness variable") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- drivers/net/amt.c | 4 ++-- net/ipv4/igmp.c | 24 +++++++++++++----------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index be2719a3ba70..89563d1b2a3b 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -563,7 +563,7 @@ static struct sk_buff *amt_build_igmp_gq(struct amt_dev *amt) ihv3->nsrcs = 0; ihv3->resv = 0; ihv3->suppress = false; - ihv3->qrv = amt->net->ipv4.sysctl_igmp_qrv; + ihv3->qrv = READ_ONCE(amt->net->ipv4.sysctl_igmp_qrv); ihv3->csum = 0; csum = &ihv3->csum; csum_start = (void *)ihv3; @@ -3095,7 +3095,7 @@ static int amt_newlink(struct net *net, struct net_device *dev, goto err; } if (amt->mode == AMT_MODE_RELAY) { - amt->qrv = amt->net->ipv4.sysctl_igmp_qrv; + amt->qrv = READ_ONCE(amt->net->ipv4.sysctl_igmp_qrv); amt->qri = 10; dev->needed_headroom = amt->stream_dev->needed_headroom + AMT_RELAY_HLEN; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index cd7839db34da..e3ab0cb61624 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -827,7 +827,7 @@ static void igmp_ifc_event(struct in_device *in_dev) struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv); + WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv)); igmp_ifc_start_timer(in_dev, 1); } @@ -1009,7 +1009,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, * received value was zero, use the default or statically * configured value. */ - in_dev->mr_qrv = ih3->qrv ?: net->ipv4.sysctl_igmp_qrv; + in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; /* RFC3376, 8.3. Query Response Interval: @@ -1189,7 +1189,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; - pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; @@ -1240,9 +1240,11 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) swap(im->tomb, pmc->tomb); swap(im->sources, pmc->sources); for (psf = im->sources; psf; psf = psf->sf_next) - psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + psf->sf_crcount = in_dev->mr_qrv ?: + READ_ONCE(net->ipv4.sysctl_igmp_qrv); } else { - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + im->crcount = in_dev->mr_qrv ?: + READ_ONCE(net->ipv4.sysctl_igmp_qrv); } in_dev_put(pmc->interface); kfree_pmc(pmc); @@ -1349,7 +1351,7 @@ static void igmp_group_added(struct ip_mc_list *im) if (in_dev->dead) return; - im->unsolicit_count = net->ipv4.sysctl_igmp_qrv; + im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); @@ -1363,7 +1365,7 @@ static void igmp_group_added(struct ip_mc_list *im) * IN() to IN(A). */ if (im->sfmode == MCAST_EXCLUDE) - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); igmp_ifc_event(in_dev); #endif @@ -1754,7 +1756,7 @@ static void ip_mc_reset(struct in_device *in_dev) in_dev->mr_qi = IGMP_QUERY_INTERVAL; in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; - in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; + in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv); } #else static void ip_mc_reset(struct in_device *in_dev) @@ -1888,7 +1890,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { - psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; @@ -1952,7 +1954,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST - pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -2131,7 +2133,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ - pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; From f2f316e287e6c2e3a1c5bab8d9b77ee03daa0463 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:45 -0700 Subject: [PATCH 167/248] tcp: Fix data-races around keepalive sysctl knobs. While reading sysctl_tcp_keepalive_(time|probes|intvl), they can be changed concurrently. Thus, we need to add READ_ONCE() to their readers. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/tcp.h | 9 ++++++--- net/smc/smc_llc.c | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 1636c55e798b..204478d5d388 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1493,21 +1493,24 @@ static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl; + return tp->keepalive_intvl ? : + READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); } static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time; + return tp->keepalive_time ? : + READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes; + return tp->keepalive_probes ? : + READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index c4d057b2941d..0bde36b56472 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -2122,7 +2122,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) init_waitqueue_head(&lgr->llc_flow_waiter); init_waitqueue_head(&lgr->llc_msg_waiter); mutex_init(&lgr->llc_conf_mutex); - lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; + lgr->llc_testlink_time = READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } /* called after lgr was removed from lgr_list */ From 20a3b1c0f603e8c55c3396abd12dfcfb523e4d3c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:46 -0700 Subject: [PATCH 168/248] tcp: Fix data-races around sysctl_tcp_syn(ack)?_retries. While reading sysctl_tcp_syn(ack)?_retries, they can be changed concurrently. Thus, we need to add READ_ONCE() to their readers. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 3 ++- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_timer.c | 10 +++++++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 2c44556af452..eb31c7158b39 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -833,7 +833,8 @@ static void reqsk_timer_handler(struct timer_list *t) icsk = inet_csk(sk_listener); net = sock_net(sk_listener); - max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; + max_syn_ack_retries = icsk->icsk_syn_retries ? : + READ_ONCE(net->ipv4.sysctl_tcp_synack_retries); /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. * If synack was not acknowledged for 1 second, it means diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2222dfdde316..19ce08c9fbdc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3967,7 +3967,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = keepalive_probes(tp); break; case TCP_SYNCNT: - val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; + val = icsk->icsk_syn_retries ? : + READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: val = tp->linger2; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4f3b9ab222b6..a234704e8163 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -239,7 +239,8 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) __dst_negative_advice(sk); - retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; + retry_until = icsk->icsk_syn_retries ? : + READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); expired = icsk->icsk_retransmits >= retry_until; } else { if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { @@ -406,12 +407,15 @@ abort: tcp_write_err(sk); static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) { struct inet_connection_sock *icsk = inet_csk(sk); - int max_retries = icsk->icsk_syn_retries ? : - sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ struct tcp_sock *tp = tcp_sk(sk); + int max_retries; req->rsk_ops->syn_ack_timeout(req); + /* add one more retry for fastopen */ + max_retries = icsk->icsk_syn_retries ? : + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1; + if (req->num_timeout >= max_retries) { tcp_write_err(sk); return; From f2e383b5bb6bbc60a0b94b87b3e49a2b1aefd11e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:47 -0700 Subject: [PATCH 169/248] tcp: Fix data-races around sysctl_tcp_syncookies. While reading sysctl_tcp_syncookies, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/filter.c | 4 ++-- net/ipv4/syncookies.c | 3 ++- net/ipv4/tcp_input.c | 20 ++++++++++++-------- net/ipv6/syncookies.c | 3 ++- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 2a6a0b0ce43e..7950f7520765 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7041,7 +7041,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; - if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -EINVAL; if (!th->ack || th->rst || th->syn) @@ -7116,7 +7116,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; - if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -ENOENT; if (!th->syn || th->ack || th->fin || th->rst) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b387c4835155..9b234b42021e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -340,7 +340,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct flowi4 fl4; u32 tsoff = 0; - if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) || + !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ec4edc37313..8271eaad887b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6797,11 +6797,14 @@ static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; - bool want_cookie = false; struct net *net = sock_net(sk); + bool want_cookie = false; + u8 syncookies; + + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); #ifdef CONFIG_SYN_COOKIES - if (net->ipv4.sysctl_tcp_syncookies) { + if (syncookies) { msg = "Sending cookies"; want_cookie = true; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); @@ -6809,8 +6812,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - if (!queue->synflood_warned && - net->ipv4.sysctl_tcp_syncookies != 2 && + if (!queue->synflood_warned && syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, sk->sk_num, msg); @@ -6859,7 +6861,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, struct tcp_sock *tp = tcp_sk(sk); u16 mss; - if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 && + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 && !inet_csk_reqsk_queue_is_full(sk)) return 0; @@ -6893,13 +6895,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, bool want_cookie = false; struct dst_entry *dst; struct flowi fl; + u8 syncookies; + + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is * evidently real one. */ - if ((net->ipv4.sysctl_tcp_syncookies == 2 || - inet_csk_reqsk_queue_is_full(sk)) && !isn) { + if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); if (!want_cookie) goto drop; @@ -6949,7 +6953,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!want_cookie && !isn) { /* Kill the following clause, if you dislike this way. */ - if (!net->ipv4.sysctl_tcp_syncookies && + if (!syncookies && (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (net->ipv4.sysctl_max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst)) { diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 9cc123f000fb..5014aa663452 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -141,7 +141,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) __u8 rcv_wscale; u32 tsoff = 0; - if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) || + !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk)) From 4177f545895b1da08447a80692f30617154efa6e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:48 -0700 Subject: [PATCH 170/248] tcp: Fix data-races around sysctl_tcp_migrate_req. While reading sysctl_tcp_migrate_req, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: f9ac779f881c ("net: Introduce net.ipv4.tcp_migrate_req.") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/sock_reuseport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 3f00a28fe762..5daa1fa54249 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -387,7 +387,7 @@ void reuseport_stop_listen_sock(struct sock *sk) prog = rcu_dereference_protected(reuse->prog, lockdep_is_held(&reuseport_lock)); - if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req || + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) || (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { /* Migration capable, move sk from the listening section * to the closed section. @@ -545,7 +545,7 @@ struct sock *reuseport_migrate_sock(struct sock *sk, hash = migrating_sk->sk_hash; prog = rcu_dereference(reuse->prog); if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { - if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req)) goto select_by_hash; goto failure; } From 46778cd16e6a5ad1b2e3a91f6c057c907379418e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:49 -0700 Subject: [PATCH 171/248] tcp: Fix data-races around sysctl_tcp_reordering. While reading sysctl_tcp_reordering, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 10 +++++++--- net/ipv4/tcp_metrics.c | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 19ce08c9fbdc..b3632fa5df07 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -441,7 +441,7 @@ void tcp_init_sock(struct sock *sk) tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; - tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; + tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); tp->tsoffset = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8271eaad887b..de4ccd173c7f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2139,6 +2139,7 @@ void tcp_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; + u8 reordering; tcp_timeout_mark_lost(sk); @@ -2159,10 +2160,12 @@ void tcp_enter_loss(struct sock *sk) /* Timeout in disordered state after receiving substantial DUPACKs * suggests that the degree of reordering is over-estimated. */ + reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering); if (icsk->icsk_ca_state <= TCP_CA_Disorder && - tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) + tp->sacked_out >= reordering) tp->reordering = min_t(unsigned int, tp->reordering, - net->ipv4.sysctl_tcp_reordering); + reordering); + tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); @@ -3464,7 +3467,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */ - if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) + if (tcp_sk(sk)->reordering > + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering)) return flag & FLAG_FORWARD_PROGRESS; return flag & FLAG_DATA_ACKED; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 7029b0e98edb..a501150deaa3 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -428,7 +428,8 @@ void tcp_update_metrics(struct sock *sk) if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val < tp->reordering && - tp->reordering != net->ipv4.sysctl_tcp_reordering) + tp->reordering != + READ_ONCE(net->ipv4.sysctl_tcp_reordering)) tcp_metric_set(tm, TCP_METRIC_REORDERING, tp->reordering); } From 39e24435a776e9de5c6dd188836cf2523547804b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:50 -0700 Subject: [PATCH 172/248] tcp: Fix data-races around some timeout sysctl knobs. While reading these sysctl knobs, they can be changed concurrently. Thus, we need to add READ_ONCE() to their readers. - tcp_retries1 - tcp_retries2 - tcp_orphan_retries - tcp_fin_timeout Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_timer.c | 10 +++++----- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 204478d5d388..23ccaa386746 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1523,7 +1523,8 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) static inline int tcp_fin_time(const struct sock *sk) { - int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout; + int fin_timeout = tcp_sk(sk)->linger2 ? : + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout); const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b3632fa5df07..b1b1bcbc4f60 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3973,7 +3973,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_LINGER2: val = tp->linger2; if (val >= 0) - val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ; + val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 02ab3a9c6657..3b3552d292a5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4090,7 +4090,7 @@ void tcp_send_probe0(struct sock *sk) icsk->icsk_probes_out++; if (err <= 0) { - if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) + if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; timeout = tcp_probe0_when(sk, TCP_RTO_MAX); } else { diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a234704e8163..ec5277becc6a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -143,7 +143,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) */ static int tcp_orphan_retries(struct sock *sk, bool alive) { - int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */ + int retries = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */ /* We know from an ICMP that something is wrong. */ if (sk->sk_err_soft && !alive) @@ -243,14 +243,14 @@ static int tcp_write_timeout(struct sock *sk) READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); expired = icsk->icsk_retransmits >= retry_until; } else { - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { + if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); __dst_negative_advice(sk); } - retry_until = net->ipv4.sysctl_tcp_retries2; + retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { const bool alive = icsk->icsk_rto < TCP_RTO_MAX; @@ -381,7 +381,7 @@ static void tcp_probe_timer(struct sock *sk) msecs_to_jiffies(icsk->icsk_user_timeout)) goto abort; - max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; + max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; @@ -589,7 +589,7 @@ out_reset_timer: } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX); - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) + if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0)) __sk_dst_reset(sk); out:; From 55be873695ed8912eb77ff46d1d1cadf028bd0f3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:51 -0700 Subject: [PATCH 173/248] tcp: Fix a data-race around sysctl_tcp_notsent_lowat. While reading sysctl_tcp_notsent_lowat, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: c9bee3b7fdec ("tcp: TCP_NOTSENT_LOWAT socket option") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 23ccaa386746..6ee1fb4fb292 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2027,7 +2027,7 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; + return tp->notsent_lowat ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } bool tcp_stream_memory_free(const struct sock *sk, int wake); From cbfc6495586a3f09f6f07d9fb3c7cafe807e3c55 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:52 -0700 Subject: [PATCH 174/248] tcp: Fix a data-race around sysctl_tcp_tw_reuse. While reading sysctl_tcp_tw_reuse, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index da5a3c44c4fb..d16e6e40f47b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -108,10 +108,10 @@ static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) { + int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); const struct inet_timewait_sock *tw = inet_twsk(sktw); const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); - int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; if (reuse == 2) { /* Still does not detect *everything* that goes through From 79539f34743d3e14cc1fa6577d326a82cc64d62f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:53 -0700 Subject: [PATCH 175/248] tcp: Fix data-races around sysctl_max_syn_backlog. While reading sysctl_max_syn_backlog, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index de4ccd173c7f..d451248bebec 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6956,10 +6956,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); if (!want_cookie && !isn) { + int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); + /* Kill the following clause, if you dislike this way. */ if (!syncookies && - (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (net->ipv4.sysctl_max_syn_backlog >> 2)) && + (max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst)) { /* Without syncookies last quarter of * backlog is filled with destinations, From 5a54213318c43f4009ae158347aa6016e3b9b55a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:54 -0700 Subject: [PATCH 176/248] tcp: Fix data-races around sysctl_tcp_fastopen. While reading sysctl_tcp_fastopen, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 2100c8d2d9db ("net-tcp: Fast Open base") Signed-off-by: Kuniyuki Iwashima Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 6 ++++-- net/ipv4/tcp_fastopen.c | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8baef2f3fc4b..252c8bceaba4 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -217,7 +217,7 @@ int inet_listen(struct socket *sock, int backlog) * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). */ - tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b1b1bcbc4f60..2faaaaf540ac 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1150,7 +1150,8 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & + TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -3617,7 +3618,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & + TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index fdbcf2a6d08e..0acdb5473850 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -332,7 +332,7 @@ static bool tcp_fastopen_no_cookie(const struct sock *sk, const struct dst_entry *dst, int flag) { - return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) || + return (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & flag) || tcp_sk(sk)->fastopen_no_cookie || (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); } @@ -347,7 +347,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, const struct dst_entry *dst) { bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; - int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + int tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; int ret = 0; From 021266ec640c7a4527e6cd4b7349a512b351de1d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Jul 2022 10:17:55 -0700 Subject: [PATCH 177/248] tcp: Fix data-races around sysctl_tcp_fastopen_blackhole_timeout. While reading sysctl_tcp_fastopen_blackhole_timeout, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: cf1ef3f0719b ("net/tcp_fastopen: Disable active side TFO in certain scenarios") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 0acdb5473850..825b216d11f5 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -489,7 +489,7 @@ void tcp_fastopen_active_disable(struct sock *sk) { struct net *net = sock_net(sk); - if (!sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout)) return; /* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */ @@ -510,7 +510,8 @@ void tcp_fastopen_active_disable(struct sock *sk) */ bool tcp_fastopen_active_should_disable(struct sock *sk) { - unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; + unsigned int tfo_bh_timeout = + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout); unsigned long timeout; int tfo_da_times; int multiplier; From 19b3b13c932fc8d613e50e3e92c1944f9fcc02c7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:30 +0300 Subject: [PATCH 178/248] docs: net: dsa: update probing documentation Since the blamed commit we don't have register_switch_driver() and unregister_switch_driver() anymore. Additionally, the expected dsa_register_switch() and dsa_unregister_switch() calls aren't documented. Update the probing section with the details of how things are currently done. Fixes: 93e86b3bc842 ("net: dsa: Remove legacy probing support") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 76 ++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 11 deletions(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index ed7fa76e7a40..8691a84c7e85 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -503,14 +503,74 @@ per-port PHY specific details: interface connection, MDIO bus location, etc. Driver development ================== -DSA switch drivers need to implement a dsa_switch_ops structure which will +DSA switch drivers need to implement a ``dsa_switch_ops`` structure which will contain the various members described below. -``register_switch_driver()`` registers this dsa_switch_ops in its internal list -of drivers to probe for. ``unregister_switch_driver()`` does the exact opposite. +Probing, registration and device lifetime +----------------------------------------- -Unless requested differently by setting the priv_size member accordingly, DSA -does not allocate any driver private context space. +DSA switches are regular ``device`` structures on buses (be they platform, SPI, +I2C, MDIO or otherwise). The DSA framework is not involved in their probing +with the device core. + +Switch registration from the perspective of a driver means passing a valid +``struct dsa_switch`` pointer to ``dsa_register_switch()``, usually from the +switch driver's probing function. The following members must be valid in the +provided structure: + +- ``ds->dev``: will be used to parse the switch's OF node or platform data. + +- ``ds->num_ports``: will be used to create the port list for this switch, and + to validate the port indices provided in the OF node. + +- ``ds->ops``: a pointer to the ``dsa_switch_ops`` structure holding the DSA + method implementations. + +- ``ds->priv``: backpointer to a driver-private data structure which can be + retrieved in all further DSA method callbacks. + +In addition, the following flags in the ``dsa_switch`` structure may optionally +be configured to obtain driver-specific behavior from the DSA core. Their +behavior when set is documented through comments in ``include/net/dsa.h``. + +- ``ds->vlan_filtering_is_global`` + +- ``ds->needs_standalone_vlan_filtering`` + +- ``ds->configure_vlan_while_not_filtering`` + +- ``ds->untag_bridge_pvid`` + +- ``ds->assisted_learning_on_cpu_port`` + +- ``ds->mtu_enforcement_ingress`` + +- ``ds->fdb_isolation`` + +Internally, DSA keeps an array of switch trees (group of switches) global to +the kernel, and attaches a ``dsa_switch`` structure to a tree on registration. +The tree ID to which the switch is attached is determined by the first u32 +number of the ``dsa,member`` property of the switch's OF node (0 if missing). +The switch ID within the tree is determined by the second u32 number of the +same OF property (0 if missing). Registering multiple switches with the same +switch ID and tree ID is illegal and will cause an error. Using platform data, +a single switch and a single switch tree is permitted. + +In case of a tree with multiple switches, probing takes place asymmetrically. +The first N-1 callers of ``dsa_register_switch()`` only add their ports to the +port list of the tree (``dst->ports``), each port having a backpointer to its +associated switch (``dp->ds``). Then, these switches exit their +``dsa_register_switch()`` call early, because ``dsa_tree_setup_routing_table()`` +has determined that the tree is not yet complete (not all ports referenced by +DSA links are present in the tree's port list). The tree becomes complete when +the last switch calls ``dsa_register_switch()``, and this triggers the effective +continuation of initialization (including the call to ``ds->ops->setup()``) for +all switches within that tree, all as part of the calling context of the last +switch's probe function. + +The opposite of registration takes place when calling ``dsa_unregister_switch()``, +which removes a switch's ports from the port list of the tree. The entire tree +is torn down when the first switch unregisters. Switch configuration -------------------- @@ -518,12 +578,6 @@ Switch configuration - ``tag_protocol``: this is to indicate what kind of tagging protocol is supported, should be a valid value from the ``dsa_tag_protocol`` enum -- ``probe``: probe routine which will be invoked by the DSA platform device upon - registration to test for the presence/absence of a switch device. For MDIO - devices, it is recommended to issue a read towards internal registers using - the switch pseudo-PHY and return whether this is a supported device. For other - buses, return a non-NULL string - - ``setup``: setup function for the switch, this function is responsible for setting up the ``dsa_switch_ops`` private structure with all it needs: register maps, interrupts, mutexes, locks, etc. This function is also expected to properly From 54367831c5d0ce273d82814f5fcb35c004f6a912 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:31 +0300 Subject: [PATCH 179/248] docs: net: dsa: document the shutdown behavior Document the changes that took place in the DSA core in the blamed commit. Fixes: 0650bf52b31f ("net: dsa: be compatible with masters which unregister on shutdown") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index 8691a84c7e85..c04cb4c95b64 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -572,6 +572,24 @@ The opposite of registration takes place when calling ``dsa_unregister_switch()` which removes a switch's ports from the port list of the tree. The entire tree is torn down when the first switch unregisters. +It is mandatory for DSA switch drivers to implement the ``shutdown()`` callback +of their respective bus, and call ``dsa_switch_shutdown()`` from it (a minimal +version of the full teardown performed by ``dsa_unregister_switch()``). +The reason is that DSA keeps a reference on the master net device, and if the +driver for the master device decides to unbind on shutdown, DSA's reference +will block that operation from finalizing. + +Either ``dsa_switch_shutdown()`` or ``dsa_unregister_switch()`` must be called, +but not both, and the device driver model permits the bus' ``remove()`` method +to be called even if ``shutdown()`` was already called. Therefore, drivers are +expected to implement a mutual exclusion method between ``remove()`` and +``shutdown()`` by setting their drvdata to NULL after any of these has run, and +checking whether the drvdata is NULL before proceeding to take any action. + +After ``dsa_switch_shutdown()`` or ``dsa_unregister_switch()`` was called, no +further callbacks via the provided ``dsa_switch_ops`` may take place, and the +driver may free the data structures associated with the ``dsa_switch``. + Switch configuration -------------------- From c3f0e84d10862b2b2ed927561f12fe0bf8033590 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:32 +0300 Subject: [PATCH 180/248] docs: net: dsa: rename tag_protocol to get_tag_protocol Since the blamed commit, the enum was turned into a function pointer and also renamed. Update the documentation. Fixes: 7b314362a234 ("net: dsa: Allow the DSA driver to indicate the tag protocol") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index c04cb4c95b64..f49996e97363 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -593,8 +593,8 @@ driver may free the data structures associated with the ``dsa_switch``. Switch configuration -------------------- -- ``tag_protocol``: this is to indicate what kind of tagging protocol is supported, - should be a valid value from the ``dsa_tag_protocol`` enum +- ``get_tag_protocol``: this is to indicate what kind of tagging protocol is + supported, should be a valid value from the ``dsa_tag_protocol`` enum - ``setup``: setup function for the switch, this function is responsible for setting up the ``dsa_switch_ops`` private structure with all it needs: register maps, From c56313a42aaa0c353af6425aed63719823ccfc32 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:33 +0300 Subject: [PATCH 181/248] docs: net: dsa: add more info about the other arguments to get_tag_protocol Changes were made to the prototype of get_tag_protocol without describing at a high level what they are about. Update the documentation to explain that. Fixes: 5ed4e3eb0217 ("net: dsa: Pass a port to get_tag_protocol()") Fixes: 4d776482ecc6 ("net: dsa: Get information about stacked DSA protocol") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index f49996e97363..76b0bc1abbae 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -594,7 +594,11 @@ Switch configuration -------------------- - ``get_tag_protocol``: this is to indicate what kind of tagging protocol is - supported, should be a valid value from the ``dsa_tag_protocol`` enum + supported, should be a valid value from the ``dsa_tag_protocol`` enum. + The returned information does not have to be static; the driver is passed the + CPU port number, as well as the tagging protocol of a possibly stacked + upstream switch, in case there are hardware limitations in terms of supported + tag formats. - ``setup``: setup function for the switch, this function is responsible for setting up the ``dsa_switch_ops`` private structure with all it needs: register maps, From d6a0336addd47af8869953a480e44a63726fad8f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:34 +0300 Subject: [PATCH 182/248] docs: net: dsa: document change_tag_protocol Support for changing the tagging protocol was added without this operation being documented; do so now. Fixes: 53da0ebaad10 ("net: dsa: allow changing the tag protocol via the "tagging" device attribute") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index 76b0bc1abbae..83c1a02376c8 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -600,6 +600,12 @@ Switch configuration upstream switch, in case there are hardware limitations in terms of supported tag formats. +- ``change_tag_protocol``: when the default tagging protocol has compatibility + problems with the master or other issues, the driver may support changing it + at runtime, either through a device tree property or through sysfs. In that + case, further calls to ``get_tag_protocol`` should report the protocol in + current use. + - ``setup``: setup function for the switch, this function is responsible for setting up the ``dsa_switch_ops`` private structure with all it needs: register maps, interrupts, mutexes, locks, etc. This function is also expected to properly From b763f50dc157c2796dded090fac3e05cb5147348 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:35 +0300 Subject: [PATCH 183/248] docs: net: dsa: document the teardown method A teardown method was added to dsa_switch_ops without being documented. Do so now. Fixes: 5e3f847a02aa ("net: dsa: Add teardown callback for drivers") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index 83c1a02376c8..e16eb2e5e787 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -617,7 +617,8 @@ Switch configuration fully configured and ready to serve any kind of request. It is recommended to issue a software reset of the switch during this setup function in order to avoid relying on what a previous software agent such as a bootloader/firmware - may have previously configured. + may have previously configured. The method responsible for undoing any + applicable allocations or operations done here is ``teardown``. PHY devices and link management ------------------------------- From 3c87237ecd27fe5534f3324a4dccbce059c04e40 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:36 +0300 Subject: [PATCH 184/248] docs: net: dsa: document port_setup and port_teardown These methods were added without being documented, fix that. Fixes: fd292c189a97 ("net: dsa: tear down devlink port regions when tearing down the devlink port on error") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index e16eb2e5e787..eade80ed226b 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -620,6 +620,15 @@ Switch configuration may have previously configured. The method responsible for undoing any applicable allocations or operations done here is ``teardown``. +- ``port_setup`` and ``port_teardown``: methods for initialization and + destruction of per-port data structures. It is mandatory for some operations + such as registering and unregistering devlink port regions to be done from + these methods, otherwise they are optional. A port will be torn down only if + it has been previously set up. It is possible for a port to be set up during + probing only to be torn down immediately afterwards, for example in case its + PHY cannot be found. In this case, probing of the DSA switch continues + without that particular port. + PHY devices and link management ------------------------------- From 0cb8682ebf5eedbfd71a8b212f23afc1aedfe1ba Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:37 +0300 Subject: [PATCH 185/248] docs: net: dsa: document port_fast_age The provided information about FDB flushing is not really up to date. The DSA core automatically calls port_fast_age() when necessary, and drivers should just implement that rather than hooking it to port_bridge_leave, port_stp_state_set and others. Fixes: 732f794c1baf ("net: dsa: add port fast ageing") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index eade80ed226b..d83e61958e88 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -738,15 +738,11 @@ Bridge layer - ``port_bridge_leave``: bridge layer function invoked when a given switch port is removed from a bridge, this function should do what's necessary at the switch level to deny the leaving port from ingress/egress traffic from the - remaining bridge members. When the port leaves the bridge, it should be aged - out at the switch hardware for the switch to (re) learn MAC addresses behind - this port. + remaining bridge members. - ``port_stp_state_set``: bridge layer function invoked when a given switch port STP state is computed by the bridge layer and should be propagated to switch - hardware to forward/block/learn traffic. The switch driver is responsible for - computing a STP state change based on current and asked parameters and perform - the relevant ageing based on the intersection results + hardware to forward/block/learn traffic. - ``port_bridge_flags``: bridge layer function invoked when a port must configure its settings for e.g. flooding of unknown traffic or source address @@ -775,6 +771,12 @@ Bridge layer - ``port_bridge_tx_fwd_unoffload``: bridge layer function invoked when a driver leaves a bridge port which had the TX forwarding offload feature enabled. +- ``port_fast_age``: bridge layer function invoked when flushing the + dynamically learned FDB entries on the port is necessary. This is called when + transitioning from an STP state where learning should take place to an STP + state where it shouldn't, or when leaving a bridge, or when address learning + is turned off via ``port_bridge_flags``. + Bridge VLAN filtering --------------------- From 308362394850b680ef3e2cd548bfaa27fd120a4d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:38 +0300 Subject: [PATCH 186/248] docs: net: dsa: remove port_bridge_tx_fwd_offload We've changed the API through which we can offload the bridge TX forwarding process. Update the documentation in light of the removal of 2 DSA switch ops. Fixes: b079922ba2ac ("net: dsa: add a "tx_fwd_offload" argument to ->port_bridge_join") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 62 +++++++++++++++++++++------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index d83e61958e88..75346a8bab62 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -730,10 +730,56 @@ Power management Bridge layer ------------ +Offloading the bridge forwarding plane is optional and handled by the methods +below. They may be absent, return -EOPNOTSUPP, or ``ds->max_num_bridges`` may +be non-zero and exceeded, and in this case, joining a bridge port is still +possible, but the packet forwarding will take place in software, and the ports +under a software bridge must remain configured in the same way as for +standalone operation, i.e. have all bridging service functions (address +learning etc) disabled, and send all received packets to the CPU port only. + +Concretely, a port starts offloading the forwarding plane of a bridge once it +returns success to the ``port_bridge_join`` method, and stops doing so after +``port_bridge_leave`` has been called. Offloading the bridge means autonomously +learning FDB entries in accordance with the software bridge port's state, and +autonomously forwarding (or flooding) received packets without CPU intervention. +This is optional even when offloading a bridge port. Tagging protocol drivers +are expected to call ``dsa_default_offload_fwd_mark(skb)`` for packets which +have already been autonomously forwarded in the forwarding domain of the +ingress switch port. DSA, through ``dsa_port_devlink_setup()``, considers all +switch ports part of the same tree ID to be part of the same bridge forwarding +domain (capable of autonomous forwarding to each other). + +Offloading the TX forwarding process of a bridge is a distinct concept from +simply offloading its forwarding plane, and refers to the ability of certain +driver and tag protocol combinations to transmit a single skb coming from the +bridge device's transmit function to potentially multiple egress ports (and +thereby avoid its cloning in software). + +Packets for which the bridge requests this behavior are called data plane +packets and have ``skb->offload_fwd_mark`` set to true in the tag protocol +driver's ``xmit`` function. Data plane packets are subject to FDB lookup, +hardware learning on the CPU port, and do not override the port STP state. +Additionally, replication of data plane packets (multicast, flooding) is +handled in hardware and the bridge driver will transmit a single skb for each +packet that may or may not need replication. + +When the TX forwarding offload is enabled, the tag protocol driver is +responsible to inject packets into the data plane of the hardware towards the +correct bridging domain (FID) that the port is a part of. The port may be +VLAN-unaware, and in this case the FID must be equal to the FID used by the +driver for its VLAN-unaware address database associated with that bridge. +Alternatively, the bridge may be VLAN-aware, and in that case, it is guaranteed +that the packet is also VLAN-tagged with the VLAN ID that the bridge processed +this packet in. It is the responsibility of the hardware to untag the VID on +the egress-untagged ports, or keep the tag on the egress-tagged ones. + - ``port_bridge_join``: bridge layer function invoked when a given switch port is added to a bridge, this function should do what's necessary at the switch level to permit the joining port to be added to the relevant logical domain for it to ingress/egress traffic with other members of the bridge. + By setting the ``tx_fwd_offload`` argument to true, the TX forwarding process + of this bridge is also offloaded. - ``port_bridge_leave``: bridge layer function invoked when a given switch port is removed from a bridge, this function should do what's necessary at the @@ -755,22 +801,6 @@ Bridge layer CPU port, and flooding towards the CPU port should also be enabled, due to a lack of an explicit address filtering mechanism in the DSA core. -- ``port_bridge_tx_fwd_offload``: bridge layer function invoked after - ``port_bridge_join`` when a driver sets ``ds->num_fwd_offloading_bridges`` to - a non-zero value. Returning success in this function activates the TX - forwarding offload bridge feature for this port, which enables the tagging - protocol driver to inject data plane packets towards the bridging domain that - the port is a part of. Data plane packets are subject to FDB lookup, hardware - learning on the CPU port, and do not override the port STP state. - Additionally, replication of data plane packets (multicast, flooding) is - handled in hardware and the bridge driver will transmit a single skb for each - packet that needs replication. The method is provided as a configuration - point for drivers that need to configure the hardware for enabling this - feature. - -- ``port_bridge_tx_fwd_unoffload``: bridge layer function invoked when a driver - leaves a bridge port which had the TX forwarding offload feature enabled. - - ``port_fast_age``: bridge layer function invoked when flushing the dynamically learned FDB entries on the port is necessary. This is called when transitioning from an STP state where learning should take place to an STP From e465d507c76ce2552e1e08513f1d1ca8c4175e9c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:39 +0300 Subject: [PATCH 187/248] docs: net: dsa: remove port_vlan_dump This was deleted in 2017, delete the obsolete documentation. Fixes: c069fcd82c57 ("net: dsa: Remove support for bypass bridge port attributes/vlan set") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index 75346a8bab62..e61eef93be1b 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -828,10 +828,6 @@ Bridge VLAN filtering - ``port_vlan_del``: bridge layer function invoked when a VLAN is removed from the given switch port -- ``port_vlan_dump``: bridge layer function invoked with a switchdev callback - function that the driver has to call for each VLAN the given port is a member - of. A switchdev object is used to carry the VID and bridge flags. - - ``port_fdb_add``: bridge layer function invoked when the bridge wants to install a Forwarding Database entry, the switch hardware should be programmed with the specified address in the specified VLAN Id in the forwarding database From 7f75d3dd4f5b00a1d3ef853f044a25b4cb55082a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:40 +0300 Subject: [PATCH 188/248] docs: net: dsa: delete port_mdb_dump This was deleted in 2017, stop documenting it. Fixes: dc0cbff3ff9f ("net: dsa: Remove redundant MDB dump support") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index e61eef93be1b..118853d1d7ac 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -862,10 +862,6 @@ Bridge VLAN filtering the specified MAC address from the specified VLAN ID if it was mapped into this port forwarding database. -- ``port_mdb_dump``: bridge layer function invoked with a switchdev callback - function that the driver has to call for each MAC address known to be behind - the given port. A switchdev object is used to carry the VID and MDB info. - Link aggregation ---------------- From 4e9d9bb6df6b4ef87f217e81a8eb37c359400e2e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:41 +0300 Subject: [PATCH 189/248] docs: net: dsa: add a section for address databases The given definition for what VID 0 represents in the current port_fdb_add and port_mdb_add is blatantly wrong. Delete it and explain the concepts surrounding DSA's understanding of FDB isolation. Fixes: c26933639b54 ("net: dsa: request drivers to perform FDB isolation") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 136 +++++++++++++++++++++++++-- 1 file changed, 130 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index 118853d1d7ac..c8bd246d4010 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -727,6 +727,136 @@ Power management ``BR_STATE_DISABLED`` and propagating changes to the hardware if this port is disabled while being a bridge member +Address databases +----------------- + +Switching hardware is expected to have a table for FDB entries, however not all +of them are active at the same time. An address database is the subset (partition) +of FDB entries that is active (can be matched by address learning on RX, or FDB +lookup on TX) depending on the state of the port. An address database may +occasionally be called "FID" (Filtering ID) in this document, although the +underlying implementation may choose whatever is available to the hardware. + +For example, all ports that belong to a VLAN-unaware bridge (which is +*currently* VLAN-unaware) are expected to learn source addresses in the +database associated by the driver with that bridge (and not with other +VLAN-unaware bridges). During forwarding and FDB lookup, a packet received on a +VLAN-unaware bridge port should be able to find a VLAN-unaware FDB entry having +the same MAC DA as the packet, which is present on another port member of the +same bridge. At the same time, the FDB lookup process must be able to not find +an FDB entry having the same MAC DA as the packet, if that entry points towards +a port which is a member of a different VLAN-unaware bridge (and is therefore +associated with a different address database). + +Similarly, each VLAN of each offloaded VLAN-aware bridge should have an +associated address database, which is shared by all ports which are members of +that VLAN, but not shared by ports belonging to different bridges that are +members of the same VID. + +In this context, a VLAN-unaware database means that all packets are expected to +match on it irrespective of VLAN ID (only MAC address lookup), whereas a +VLAN-aware database means that packets are supposed to match based on the VLAN +ID from the classified 802.1Q header (or the pvid if untagged). + +At the bridge layer, VLAN-unaware FDB entries have the special VID value of 0, +whereas VLAN-aware FDB entries have non-zero VID values. Note that a +VLAN-unaware bridge may have VLAN-aware (non-zero VID) FDB entries, and a +VLAN-aware bridge may have VLAN-unaware FDB entries. As in hardware, the +software bridge keeps separate address databases, and offloads to hardware the +FDB entries belonging to these databases, through switchdev, asynchronously +relative to the moment when the databases become active or inactive. + +When a user port operates in standalone mode, its driver should configure it to +use a separate database called a port private database. This is different from +the databases described above, and should impede operation as standalone port +(packet in, packet out to the CPU port) as little as possible. For example, +on ingress, it should not attempt to learn the MAC SA of ingress traffic, since +learning is a bridging layer service and this is a standalone port, therefore +it would consume useless space. With no address learning, the port private +database should be empty in a naive implementation, and in this case, all +received packets should be trivially flooded to the CPU port. + +DSA (cascade) and CPU ports are also called "shared" ports because they service +multiple address databases, and the database that a packet should be associated +to is usually embedded in the DSA tag. This means that the CPU port may +simultaneously transport packets coming from a standalone port (which were +classified by hardware in one address database), and from a bridge port (which +were classified to a different address database). + +Switch drivers which satisfy certain criteria are able to optimize the naive +configuration by removing the CPU port from the flooding domain of the switch, +and just program the hardware with FDB entries pointing towards the CPU port +for which it is known that software is interested in those MAC addresses. +Packets which do not match a known FDB entry will not be delivered to the CPU, +which will save CPU cycles required for creating an skb just to drop it. + +DSA is able to perform host address filtering for the following kinds of +addresses: + +- Primary unicast MAC addresses of ports (``dev->dev_addr``). These are + associated with the port private database of the respective user port, + and the driver is notified to install them through ``port_fdb_add`` towards + the CPU port. + +- Secondary unicast and multicast MAC addresses of ports (addresses added + through ``dev_uc_add()`` and ``dev_mc_add()``). These are also associated + with the port private database of the respective user port. + +- Local/permanent bridge FDB entries (``BR_FDB_LOCAL``). These are the MAC + addresses of the bridge ports, for which packets must be terminated locally + and not forwarded. They are associated with the address database for that + bridge. + +- Static bridge FDB entries installed towards foreign (non-DSA) interfaces + present in the same bridge as some DSA switch ports. These are also + associated with the address database for that bridge. + +- Dynamically learned FDB entries on foreign interfaces present in the same + bridge as some DSA switch ports, only if ``ds->assisted_learning_on_cpu_port`` + is set to true by the driver. These are associated with the address database + for that bridge. + +For various operations detailed below, DSA provides a ``dsa_db`` structure +which can be of the following types: + +- ``DSA_DB_PORT``: the FDB (or MDB) entry to be installed or deleted belongs to + the port private database of user port ``db->dp``. +- ``DSA_DB_BRIDGE``: the entry belongs to one of the address databases of bridge + ``db->bridge``. Separation between the VLAN-unaware database and the per-VID + databases of this bridge is expected to be done by the driver. +- ``DSA_DB_LAG``: the entry belongs to the address database of LAG ``db->lag``. + Note: ``DSA_DB_LAG`` is currently unused and may be removed in the future. + +The drivers which act upon the ``dsa_db`` argument in ``port_fdb_add``, +``port_mdb_add`` etc should declare ``ds->fdb_isolation`` as true. + +DSA associates each offloaded bridge and each offloaded LAG with a one-based ID +(``struct dsa_bridge :: num``, ``struct dsa_lag :: id``) for the purposes of +refcounting addresses on shared ports. Drivers may piggyback on DSA's numbering +scheme (the ID is readable through ``db->bridge.num`` and ``db->lag.id`` or may +implement their own. + +Only the drivers which declare support for FDB isolation are notified of FDB +entries on the CPU port belonging to ``DSA_DB_PORT`` databases. +For compatibility/legacy reasons, ``DSA_DB_BRIDGE`` addresses are notified to +drivers even if they do not support FDB isolation. However, ``db->bridge.num`` +and ``db->lag.id`` are always set to 0 in that case (to denote the lack of +isolation, for refcounting purposes). + +Note that it is not mandatory for a switch driver to implement physically +separate address databases for each standalone user port. Since FDB entries in +the port private databases will always point to the CPU port, there is no risk +for incorrect forwarding decisions. In this case, all standalone ports may +share the same database, but the reference counting of host-filtered addresses +(not deleting the FDB entry for a port's MAC address if it's still in use by +another port) becomes the responsibility of the driver, because DSA is unaware +that the port databases are in fact shared. This can be achieved by calling +``dsa_fdb_present_in_other_db()`` and ``dsa_mdb_present_in_other_db()``. +The down side is that the RX filtering lists of each user port are in fact +shared, which means that user port A may accept a packet with a MAC DA it +shouldn't have, only because that MAC address was in the RX filtering list of +user port B. These packets will still be dropped in software, however. + Bridge layer ------------ @@ -835,9 +965,6 @@ Bridge VLAN filtering function should return ``-EOPNOTSUPP`` to inform the bridge code to fallback to a software implementation. -.. note:: VLAN ID 0 corresponds to the port private database, which, in the context - of DSA, would be its port-based VLAN, used by the associated bridge device. - - ``port_fdb_del``: bridge layer function invoked when the bridge wants to remove a Forwarding Database entry, the switch hardware should be programmed to delete the specified MAC address from the specified VLAN ID if it was mapped into @@ -854,9 +981,6 @@ Bridge VLAN filtering specified address in the specified VLAN ID in the forwarding database associated with this VLAN ID. -.. note:: VLAN ID 0 corresponds to the port private database, which, in the context - of DSA, would be its port-based VLAN, used by the associated bridge device. - - ``port_mdb_del``: bridge layer function invoked when the bridge wants to remove a multicast database entry, the switch hardware should be programmed to delete the specified MAC address from the specified VLAN ID if it was mapped into From ea7006a7aaee54a8861e0bfd5cf6a8495fb998a7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:42 +0300 Subject: [PATCH 190/248] docs: net: dsa: re-explain what port_fdb_dump actually does Switchdev has changed radically from its initial implementation, and the currently provided definition is incorrect and very confusing. Rewrite it in light of what it actually does. Fixes: 2bedde1abbef ("net: dsa: Move FDB dump implementation inside DSA") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index c8bd246d4010..330a76c2fab6 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -970,9 +970,12 @@ Bridge VLAN filtering the specified MAC address from the specified VLAN ID if it was mapped into this port forwarding database -- ``port_fdb_dump``: bridge layer function invoked with a switchdev callback - function that the driver has to call for each MAC address known to be behind - the given port. A switchdev object is used to carry the VID and FDB info. +- ``port_fdb_dump``: bridge bypass function invoked by ``ndo_fdb_dump`` on the + physical DSA port interfaces. Since DSA does not attempt to keep in sync its + hardware FDB entries with the software bridge, this method is implemented as + a means to view the entries visible on user ports in the hardware database. + The entries reported by this function have the ``self`` flag in the output of + the ``bridge fdb show`` command. - ``port_mdb_add``: bridge layer function invoked when the bridge wants to install a multicast database entry. If the operation is not supported, this function From 6ba1a4aa5974f8a47e6322cecc965e6357b58d80 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:43 +0300 Subject: [PATCH 191/248] docs: net: dsa: delete misinformation about -EOPNOTSUPP for FDB/MDB/VLAN Returning -EOPNOTSUPP does *NOT* mean anything special. port_vlan_add() is actually called from 2 code paths, one is vlan_vid_add() from 8021q module and the other is br_switchdev_port_vlan_add() from switchdev. The bridge has a wrapper __vlan_vid_add() which first tries via switchdev, then if that returns -EOPNOTSUPP, tries again via the VLAN RX filters in the 8021q module. But DSA doesn't distinguish between one call path and the other when calling the driver's port_vlan_add(), so if the driver returns -EOPNOTSUPP to switchdev, it also returns -EOPNOTSUPP to the 8021q module. And the latter is a hard error. port_fdb_add() is called from the deferred dsa_owq only, so obviously its return code isn't propagated anywhere, and cannot be interpreted in any way. The return code from port_mdb_add() is propagated to the bridge, but again, this doesn't do anything special when -EOPNOTSUPP is returned, but rather, br_switchdev_mdb_notify() returns void. Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index 330a76c2fab6..69ea35e19755 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -951,9 +951,7 @@ Bridge VLAN filtering allowed. - ``port_vlan_add``: bridge layer function invoked when a VLAN is configured - (tagged or untagged) for the given switch port. If the operation is not - supported by the hardware, this function should return ``-EOPNOTSUPP`` to - inform the bridge code to fallback to a software implementation. + (tagged or untagged) for the given switch port.. - ``port_vlan_del``: bridge layer function invoked when a VLAN is removed from the given switch port @@ -961,9 +959,7 @@ Bridge VLAN filtering - ``port_fdb_add``: bridge layer function invoked when the bridge wants to install a Forwarding Database entry, the switch hardware should be programmed with the specified address in the specified VLAN Id in the forwarding database - associated with this VLAN ID. If the operation is not supported, this - function should return ``-EOPNOTSUPP`` to inform the bridge code to fallback to - a software implementation. + associated with this VLAN ID. - ``port_fdb_del``: bridge layer function invoked when the bridge wants to remove a Forwarding Database entry, the switch hardware should be programmed to delete @@ -978,9 +974,7 @@ Bridge VLAN filtering the ``bridge fdb show`` command. - ``port_mdb_add``: bridge layer function invoked when the bridge wants to install - a multicast database entry. If the operation is not supported, this function - should return ``-EOPNOTSUPP`` to inform the bridge code to fallback to a - software implementation. The switch hardware should be programmed with the + a multicast database entry. The switch hardware should be programmed with the specified address in the specified VLAN ID in the forwarding database associated with this VLAN ID. From 7b02f40350f1b8011f724a052dcb0849cffa6c38 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 16 Jul 2022 21:53:44 +0300 Subject: [PATCH 192/248] docs: net: dsa: mention that VLANs are now refcounted on shared ports The blamed commit updated the way in which VLANs are handled at the cross-chip notifier layer and didn't update the documentation to say that. Fix it. Fixes: 134ef2388e7f ("net: dsa: add explicit support for host bridge VLANs") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index 69ea35e19755..d742ba6bd211 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -951,7 +951,13 @@ Bridge VLAN filtering allowed. - ``port_vlan_add``: bridge layer function invoked when a VLAN is configured - (tagged or untagged) for the given switch port.. + (tagged or untagged) for the given switch port. The CPU port becomes a member + of a VLAN only if a foreign bridge port is also a member of it (and + forwarding needs to take place in software), or the VLAN is installed to the + VLAN group of the bridge device itself, for termination purposes + (``bridge vlan add dev br0 vid 100 self``). VLANs on shared ports are + reference counted and removed when there is no user left. Drivers do not need + to manually install a VLAN on the CPU port. - ``port_vlan_del``: bridge layer function invoked when a VLAN is removed from the given switch port From 968996c070ef080ee7d6150faa98a4e562ce4625 Mon Sep 17 00:00:00 2001 From: Przemyslaw Patynowski Date: Fri, 10 Jun 2022 14:15:54 +0200 Subject: [PATCH 193/248] iavf: Fix VLAN_V2 addition/rejection Fix VLAN addition, so that PF driver does not reject whole VLAN batch. Add VLAN reject handling, so rejected VLANs, won't litter VLAN filter list. Fix handling of active_(c/s)vlans, so it will be possible to re-add VLAN filters for user. Without this patch, after changing trust to off, with VLAN filters saturated, no VLAN is added, due to PF rejecting addition. Fixes: 92fc50859872 ("iavf: Restrict maximum VLAN filters for VIRTCHNL_VF_OFFLOAD_VLAN_V2") Signed-off-by: Przemyslaw Patynowski Signed-off-by: Jedrzej Jagielski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 9 ++- drivers/net/ethernet/intel/iavf/iavf_main.c | 10 ++- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 65 ++++++++++++++++++- 3 files changed, 74 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 49aed3e506a6..86bc61c300a7 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -159,8 +159,12 @@ struct iavf_vlan { struct iavf_vlan_filter { struct list_head list; struct iavf_vlan vlan; - bool remove; /* filter needs to be removed */ - bool add; /* filter needs to be added */ + struct { + u8 is_new_vlan:1; /* filter is new, wait for PF answer */ + u8 remove:1; /* filter needs to be removed */ + u8 add:1; /* filter needs to be added */ + u8 padding:5; + }; }; #define IAVF_MAX_TRAFFIC_CLASS 4 @@ -520,6 +524,7 @@ int iavf_get_vf_config(struct iavf_adapter *adapter); int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter); int iavf_send_vf_offload_vlan_v2_msg(struct iavf_adapter *adapter); void iavf_set_queue_vlan_tag_loc(struct iavf_adapter *adapter); +u16 iavf_get_num_vlans_added(struct iavf_adapter *adapter); void iavf_irq_enable(struct iavf_adapter *adapter, bool flush); void iavf_configure_queues(struct iavf_adapter *adapter); void iavf_deconfigure_queues(struct iavf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index f3ecb3bca33d..2a8643e66331 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -843,7 +843,7 @@ static void iavf_restore_filters(struct iavf_adapter *adapter) * iavf_get_num_vlans_added - get number of VLANs added * @adapter: board private structure */ -static u16 iavf_get_num_vlans_added(struct iavf_adapter *adapter) +u16 iavf_get_num_vlans_added(struct iavf_adapter *adapter) { return bitmap_weight(adapter->vsi.active_cvlans, VLAN_N_VID) + bitmap_weight(adapter->vsi.active_svlans, VLAN_N_VID); @@ -906,11 +906,6 @@ static int iavf_vlan_rx_add_vid(struct net_device *netdev, if (!iavf_add_vlan(adapter, IAVF_VLAN(vid, be16_to_cpu(proto)))) return -ENOMEM; - if (proto == cpu_to_be16(ETH_P_8021Q)) - set_bit(vid, adapter->vsi.active_cvlans); - else - set_bit(vid, adapter->vsi.active_svlans); - return 0; } @@ -2956,6 +2951,9 @@ continue_reset: adapter->aq_required |= IAVF_FLAG_AQ_ADD_CLOUD_FILTER; iavf_misc_irq_enable(adapter); + bitmap_clear(adapter->vsi.active_cvlans, 0, VLAN_N_VID); + bitmap_clear(adapter->vsi.active_svlans, 0, VLAN_N_VID); + mod_delayed_work(iavf_wq, &adapter->watchdog_task, 2); /* We were running when the reset started, so we need to restore some diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 782450d5c12f..1603e99bae4a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -626,6 +626,33 @@ static void iavf_mac_add_reject(struct iavf_adapter *adapter) spin_unlock_bh(&adapter->mac_vlan_list_lock); } +/** + * iavf_vlan_add_reject + * @adapter: adapter structure + * + * Remove VLAN filters from list based on PF response. + **/ +static void iavf_vlan_add_reject(struct iavf_adapter *adapter) +{ + struct iavf_vlan_filter *f, *ftmp; + + spin_lock_bh(&adapter->mac_vlan_list_lock); + list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { + if (f->is_new_vlan) { + if (f->vlan.tpid == ETH_P_8021Q) + clear_bit(f->vlan.vid, + adapter->vsi.active_cvlans); + else + clear_bit(f->vlan.vid, + adapter->vsi.active_svlans); + + list_del(&f->list); + kfree(f); + } + } + spin_unlock_bh(&adapter->mac_vlan_list_lock); +} + /** * iavf_add_vlans * @adapter: adapter structure @@ -683,6 +710,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) vvfl->vlan_id[i] = f->vlan.vid; i++; f->add = false; + f->is_new_vlan = true; if (i == count) break; } @@ -695,10 +723,18 @@ void iavf_add_vlans(struct iavf_adapter *adapter) iavf_send_pf_msg(adapter, VIRTCHNL_OP_ADD_VLAN, (u8 *)vvfl, len); kfree(vvfl); } else { + u16 max_vlans = adapter->vlan_v2_caps.filtering.max_filters; + u16 current_vlans = iavf_get_num_vlans_added(adapter); struct virtchnl_vlan_filter_list_v2 *vvfl_v2; adapter->current_op = VIRTCHNL_OP_ADD_VLAN_V2; + if ((count + current_vlans) > max_vlans && + current_vlans < max_vlans) { + count = max_vlans - iavf_get_num_vlans_added(adapter); + more = true; + } + len = sizeof(*vvfl_v2) + ((count - 1) * sizeof(struct virtchnl_vlan_filter)); if (len > IAVF_MAX_AQ_BUF_SIZE) { @@ -725,6 +761,9 @@ void iavf_add_vlans(struct iavf_adapter *adapter) &adapter->vlan_v2_caps.filtering.filtering_support; struct virtchnl_vlan *vlan; + if (i == count) + break; + /* give priority over outer if it's enabled */ if (filtering_support->outer) vlan = &vvfl_v2->filters[i].outer; @@ -736,8 +775,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) i++; f->add = false; - if (i == count) - break; + f->is_new_vlan = true; } } @@ -2080,6 +2118,11 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, */ iavf_netdev_features_vlan_strip_set(netdev, true); break; + case VIRTCHNL_OP_ADD_VLAN_V2: + iavf_vlan_add_reject(adapter); + dev_warn(&adapter->pdev->dev, "Failed to add VLAN filter, error %s\n", + iavf_stat_str(&adapter->hw, v_retval)); + break; default: dev_err(&adapter->pdev->dev, "PF returned error %d (%s) to our request %d\n", v_retval, iavf_stat_str(&adapter->hw, v_retval), @@ -2332,6 +2375,24 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, spin_unlock_bh(&adapter->adv_rss_lock); } break; + case VIRTCHNL_OP_ADD_VLAN_V2: { + struct iavf_vlan_filter *f; + + spin_lock_bh(&adapter->mac_vlan_list_lock); + list_for_each_entry(f, &adapter->vlan_filter_list, list) { + if (f->is_new_vlan) { + f->is_new_vlan = false; + if (f->vlan.tpid == ETH_P_8021Q) + set_bit(f->vlan.vid, + adapter->vsi.active_cvlans); + else + set_bit(f->vlan.vid, + adapter->vsi.active_svlans); + } + } + spin_unlock_bh(&adapter->mac_vlan_list_lock); + } + break; case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: /* PF enabled vlan strip on this VF. * Update netdev->features if needed to be in sync with ethtool. From 4635fd3a9d77581498f34ab9a7e4bcc211bf0a4c Mon Sep 17 00:00:00 2001 From: Przemyslaw Patynowski Date: Mon, 13 Jun 2022 19:07:42 -0400 Subject: [PATCH 194/248] iavf: Disallow changing rx/tx-frames and rx/tx-frames-irq Remove from supported_coalesce_params ETHTOOL_COALESCE_MAX_FRAMES and ETHTOOL_COALESCE_MAX_FRAMES_IRQ. As tx-frames-irq allowed user to change budget for iavf_clean_tx_irq, remove work_limit and use define for budget. Without this patch there would be possibility to change rx/tx-frames and rx/tx-frames-irq, which for rx/tx-frames did nothing, while for rx/tx-frames-irq it changed rx/tx-frames and only changed budget for cleaning NAPI poll. Fixes: fbb7ddfef253 ("i40evf: core ethtool functionality") Signed-off-by: Przemyslaw Patynowski Signed-off-by: Jun Zhang Tested-by: Marek Szlosek Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 1 - drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 10 ---------- drivers/net/ethernet/intel/iavf/iavf_main.c | 1 - drivers/net/ethernet/intel/iavf/iavf_txrx.c | 2 +- 4 files changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 86bc61c300a7..2a7b3c085aa9 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -64,7 +64,6 @@ struct iavf_vsi { u16 id; DECLARE_BITMAP(state, __IAVF_VSI_STATE_SIZE__); int base_vector; - u16 work_limit; u16 qs_handle; void *priv; /* client driver data reference. */ }; diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 3bb56714beb0..e535d4c3da49 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -692,12 +692,8 @@ static int __iavf_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, int queue) { struct iavf_adapter *adapter = netdev_priv(netdev); - struct iavf_vsi *vsi = &adapter->vsi; struct iavf_ring *rx_ring, *tx_ring; - ec->tx_max_coalesced_frames = vsi->work_limit; - ec->rx_max_coalesced_frames = vsi->work_limit; - /* Rx and Tx usecs per queue value. If user doesn't specify the * queue, return queue 0's value to represent. */ @@ -825,12 +821,8 @@ static int __iavf_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, int queue) { struct iavf_adapter *adapter = netdev_priv(netdev); - struct iavf_vsi *vsi = &adapter->vsi; int i; - if (ec->tx_max_coalesced_frames_irq || ec->rx_max_coalesced_frames_irq) - vsi->work_limit = ec->tx_max_coalesced_frames_irq; - if (ec->rx_coalesce_usecs == 0) { if (ec->use_adaptive_rx_coalesce) netif_info(adapter, drv, netdev, "rx-usecs=0, need to disable adaptive-rx for a complete disable\n"); @@ -1969,8 +1961,6 @@ static int iavf_set_rxfh(struct net_device *netdev, const u32 *indir, static const struct ethtool_ops iavf_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_USECS | - ETHTOOL_COALESCE_MAX_FRAMES | - ETHTOOL_COALESCE_MAX_FRAMES_IRQ | ETHTOOL_COALESCE_USE_ADAPTIVE, .get_drvinfo = iavf_get_drvinfo, .get_link = ethtool_op_get_link, diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 2a8643e66331..2e2c153ce46a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2240,7 +2240,6 @@ int iavf_parse_vf_resource_msg(struct iavf_adapter *adapter) adapter->vsi.back = adapter; adapter->vsi.base_vector = 1; - adapter->vsi.work_limit = IAVF_DEFAULT_IRQ_WORK; vsi->netdev = adapter->netdev; vsi->qs_handle = adapter->vsi_res->qset_handle; if (adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_RSS_PF) { diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 978f651c6b09..7bf8c25dc824 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -194,7 +194,7 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, struct iavf_tx_buffer *tx_buf; struct iavf_tx_desc *tx_desc; unsigned int total_bytes = 0, total_packets = 0; - unsigned int budget = vsi->work_limit; + unsigned int budget = IAVF_DEFAULT_IRQ_WORK; tx_buf = &tx_ring->tx_bi[i]; tx_desc = IAVF_TX_DESC(tx_ring, i); From a9f49e0060301a9bfebeca76739158d0cf91cdf6 Mon Sep 17 00:00:00 2001 From: Przemyslaw Patynowski Date: Fri, 24 Jun 2022 17:33:01 -0700 Subject: [PATCH 195/248] iavf: Fix handling of dummy receive descriptors Fix memory leak caused by not handling dummy receive descriptor properly. iavf_get_rx_buffer now sets the rx_buffer return value for dummy receive descriptors. Without this patch, when the hardware writes a dummy descriptor, iavf would not free the page allocated for the previous receive buffer. This is an unlikely event but can still happen. [Jesse: massaged commit message] Fixes: efa14c398582 ("iavf: allow null RX descriptors") Signed-off-by: Przemyslaw Patynowski Signed-off-by: Jesse Brandeburg Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 7bf8c25dc824..06d18797d25a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -1285,11 +1285,10 @@ static struct iavf_rx_buffer *iavf_get_rx_buffer(struct iavf_ring *rx_ring, { struct iavf_rx_buffer *rx_buffer; - if (!size) - return NULL; - rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean]; prefetchw(rx_buffer->page); + if (!size) + return rx_buffer; /* we are reusing so sync this buffer for CPU use */ dma_sync_single_range_for_cpu(rx_ring->dev, From d8fa2fd791a72087c1ce3336fbeefec4057c37c8 Mon Sep 17 00:00:00 2001 From: Przemyslaw Patynowski Date: Wed, 15 Jun 2022 13:57:20 -0400 Subject: [PATCH 196/248] iavf: Fix missing state logs Fix debug prints, by adding missing state prints. Extend iavf_state_str by strings for __IAVF_INIT_EXTENDED_CAPS and __IAVF_INIT_CONFIG_ADAPTER. Without this patch, when enabling debug prints for iavf.h, user will see: iavf 0000:06:0e.0: state transition from:__IAVF_INIT_GET_RESOURCES to:__IAVF_UNKNOWN_STATE iavf 0000:06:0e.0: state transition from:__IAVF_UNKNOWN_STATE to:__IAVF_UNKNOWN_STATE Fixes: 605ca7c5c670 ("iavf: Fix kernel BUG in free_msi_irqs") Signed-off-by: Przemyslaw Patynowski Signed-off-by: Jun Zhang Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 2a7b3c085aa9..0ea0361cd86b 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -464,6 +464,10 @@ static inline const char *iavf_state_str(enum iavf_state_t state) return "__IAVF_INIT_VERSION_CHECK"; case __IAVF_INIT_GET_RESOURCES: return "__IAVF_INIT_GET_RESOURCES"; + case __IAVF_INIT_EXTENDED_CAPS: + return "__IAVF_INIT_EXTENDED_CAPS"; + case __IAVF_INIT_CONFIG_ADAPTER: + return "__IAVF_INIT_CONFIG_ADAPTER"; case __IAVF_INIT_SW: return "__IAVF_INIT_SW"; case __IAVF_INIT_FAILED: From 45533a534a45cb12c20c81615d17306176cb1c57 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 14 Jul 2022 21:40:36 +0200 Subject: [PATCH 197/248] net: lan966x: Fix taking rtnl_lock while holding spin_lock When the HW deletes an entry in MAC table then it generates an interrupt. The SW will go through it's own list of MAC entries and if it is not found then it would notify the listeners about this. The problem is that when the SW will go through it's own list it would take a spin lock(lan966x->mac_lock) and when it notifies that the entry is deleted. But to notify the listeners it taking the rtnl_lock which is illegal. This is fixed by instead of notifying right away that the entry is deleted, move the entry on a temp list and once, it checks all the entries then just notify that the entries from temp list are deleted. Fixes: 5ccd66e01cbe ("net: lan966x: add support for interrupts from analyzer") Signed-off-by: Horatiu Vultur Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- .../ethernet/microchip/lan966x/lan966x_mac.c | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c index 005e56ea5da1..2d2b83c03796 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c @@ -325,10 +325,13 @@ static void lan966x_mac_irq_process(struct lan966x *lan966x, u32 row, { struct lan966x_mac_entry *mac_entry, *tmp; unsigned char mac[ETH_ALEN] __aligned(2); + struct list_head mac_deleted_entries; u32 dest_idx; u32 column; u16 vid; + INIT_LIST_HEAD(&mac_deleted_entries); + spin_lock(&lan966x->mac_lock); list_for_each_entry_safe(mac_entry, tmp, &lan966x->mac_entries, list) { bool found = false; @@ -362,20 +365,26 @@ static void lan966x_mac_irq_process(struct lan966x *lan966x, u32 row, } if (!found) { - /* Notify the bridge that the entry doesn't exist - * anymore in the HW and remove the entry from the SW - * list - */ - lan966x_mac_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE, - mac_entry->mac, mac_entry->vid, - lan966x->ports[mac_entry->port_index]->dev); - list_del(&mac_entry->list); - kfree(mac_entry); + /* Move the entry from SW list to a tmp list such that + * it would be deleted later + */ + list_add_tail(&mac_entry->list, &mac_deleted_entries); } } spin_unlock(&lan966x->mac_lock); + list_for_each_entry_safe(mac_entry, tmp, &mac_deleted_entries, list) { + /* Notify the bridge that the entry doesn't exist + * anymore in the HW + */ + lan966x_mac_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE, + mac_entry->mac, mac_entry->vid, + lan966x->ports[mac_entry->port_index]->dev); + list_del(&mac_entry->list); + kfree(mac_entry); + } + /* Now go to the list of columns and see if any entry was not in the SW * list, then that means that the entry is new so it needs to notify the * bridge. From 43243bb3195b0dc27741679471e23baed1efe98e Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 14 Jul 2022 21:40:37 +0200 Subject: [PATCH 198/248] net: lan966x: Fix usage of lan966x->mac_lock when entry is added To add an entry to the MAC table, it is required first to setup the entry and then issue a command for the MAC to learn the entry. So if it happens for two threads to add simultaneously an entry in MAC table then it would be a race condition. Fix this by using lan966x->mac_lock to protect the HW access. Fixes: fc0c3fe7486f2 ("net: lan966x: Add function lan966x_mac_ip_learn()") Signed-off-by: Horatiu Vultur Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan966x/lan966x_mac.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c index 2d2b83c03796..4f8fd5cde950 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c @@ -75,6 +75,9 @@ static int __lan966x_mac_learn(struct lan966x *lan966x, int pgid, unsigned int vid, enum macaccess_entry_type type) { + int ret; + + spin_lock(&lan966x->mac_lock); lan966x_mac_select(lan966x, mac, vid); /* Issue a write command */ @@ -86,7 +89,10 @@ static int __lan966x_mac_learn(struct lan966x *lan966x, int pgid, ANA_MACACCESS_MAC_TABLE_CMD_SET(MACACCESS_CMD_LEARN), lan966x, ANA_MACACCESS); - return lan966x_mac_wait_for_completion(lan966x); + ret = lan966x_mac_wait_for_completion(lan966x); + spin_unlock(&lan966x->mac_lock); + + return ret; } /* The mask of the front ports is encoded inside the mac parameter via a call From 99343cfa4f7560abf933fff7ab3ea58a6905c917 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 14 Jul 2022 21:40:38 +0200 Subject: [PATCH 199/248] net: lan966x: Fix usage of lan966x->mac_lock when entry is removed To remove an entry to the MAC table, it is required first to setup the entry and then issue a command for the MAC to forget the entry. So if it happens for two threads to remove simultaneously an entry in MAC table then it would be a race condition. Fix this by using lan966x->mac_lock to protect the HW access. Fixes: e18aba8941b40 ("net: lan966x: add mactable support") Signed-off-by: Horatiu Vultur Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- .../ethernet/microchip/lan966x/lan966x_mac.c | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c index 4f8fd5cde950..d0b8eba0a66d 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c @@ -119,11 +119,13 @@ int lan966x_mac_learn(struct lan966x *lan966x, int port, return __lan966x_mac_learn(lan966x, port, false, mac, vid, type); } -int lan966x_mac_forget(struct lan966x *lan966x, - const unsigned char mac[ETH_ALEN], - unsigned int vid, - enum macaccess_entry_type type) +static int lan966x_mac_forget_locked(struct lan966x *lan966x, + const unsigned char mac[ETH_ALEN], + unsigned int vid, + enum macaccess_entry_type type) { + lockdep_assert_held(&lan966x->mac_lock); + lan966x_mac_select(lan966x, mac, vid); /* Issue a forget command */ @@ -134,6 +136,20 @@ int lan966x_mac_forget(struct lan966x *lan966x, return lan966x_mac_wait_for_completion(lan966x); } +int lan966x_mac_forget(struct lan966x *lan966x, + const unsigned char mac[ETH_ALEN], + unsigned int vid, + enum macaccess_entry_type type) +{ + int ret; + + spin_lock(&lan966x->mac_lock); + ret = lan966x_mac_forget_locked(lan966x, mac, vid, type); + spin_unlock(&lan966x->mac_lock); + + return ret; +} + int lan966x_mac_cpu_learn(struct lan966x *lan966x, const char *addr, u16 vid) { return lan966x_mac_learn(lan966x, PGID_CPU, addr, vid, ENTRYTYPE_LOCKED); From c1924684369762b112428a333ad00eac6ca89d96 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 14 Jul 2022 21:40:39 +0200 Subject: [PATCH 200/248] net: lan966x: Fix usage of lan966x->mac_lock inside lan966x_mac_irq_handler The problem with this spin lock is that it was just protecting the list of the MAC entries in SW and not also the access to the MAC entries in HW. Because the access to HW is indirect, then it could happen to have race conditions. For example when SW introduced an entry in MAC table and the irq mac is trying to read something from the MAC. Update such that also the access to MAC entries in HW is protected by this lock. Fixes: 5ccd66e01cbef ("net: lan966x: add support for interrupts from analyzer") Signed-off-by: Horatiu Vultur Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- .../ethernet/microchip/lan966x/lan966x_mac.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c index d0b8eba0a66d..69e343b7f4af 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c @@ -183,7 +183,7 @@ static struct lan966x_mac_entry *lan966x_mac_alloc_entry(const unsigned char *ma { struct lan966x_mac_entry *mac_entry; - mac_entry = kzalloc(sizeof(*mac_entry), GFP_KERNEL); + mac_entry = kzalloc(sizeof(*mac_entry), GFP_ATOMIC); if (!mac_entry) return NULL; @@ -310,8 +310,8 @@ void lan966x_mac_purge_entries(struct lan966x *lan966x) spin_lock(&lan966x->mac_lock); list_for_each_entry_safe(mac_entry, tmp, &lan966x->mac_entries, list) { - lan966x_mac_forget(lan966x, mac_entry->mac, mac_entry->vid, - ENTRYTYPE_LOCKED); + lan966x_mac_forget_locked(lan966x, mac_entry->mac, + mac_entry->vid, ENTRYTYPE_LOCKED); list_del(&mac_entry->list); kfree(mac_entry); @@ -427,13 +427,14 @@ static void lan966x_mac_irq_process(struct lan966x *lan966x, u32 row, if (WARN_ON(dest_idx >= lan966x->num_phys_ports)) continue; + spin_lock(&lan966x->mac_lock); mac_entry = lan966x_mac_alloc_entry(mac, vid, dest_idx); - if (!mac_entry) + if (!mac_entry) { + spin_unlock(&lan966x->mac_lock); return; + } mac_entry->row = row; - - spin_lock(&lan966x->mac_lock); list_add_tail(&mac_entry->list, &lan966x->mac_entries); spin_unlock(&lan966x->mac_lock); @@ -455,6 +456,7 @@ irqreturn_t lan966x_mac_irq_handler(struct lan966x *lan966x) lan966x, ANA_MACTINDX); while (1) { + spin_lock(&lan966x->mac_lock); lan_rmw(ANA_MACACCESS_MAC_TABLE_CMD_SET(MACACCESS_CMD_SYNC_GET_NEXT), ANA_MACACCESS_MAC_TABLE_CMD, lan966x, ANA_MACACCESS); @@ -478,12 +480,15 @@ irqreturn_t lan966x_mac_irq_handler(struct lan966x *lan966x) stop = false; if (column == LAN966X_MAC_COLUMNS - 1 && - index == 0 && stop) + index == 0 && stop) { + spin_unlock(&lan966x->mac_lock); break; + } entry[column].mach = lan_rd(lan966x, ANA_MACHDATA); entry[column].macl = lan_rd(lan966x, ANA_MACLDATA); entry[column].maca = lan_rd(lan966x, ANA_MACACCESS); + spin_unlock(&lan966x->mac_lock); /* Once all the columns are read process them */ if (column == LAN966X_MAC_COLUMNS - 1) { From 675c807ae26b267233b97cd5006979a6bb8d54d4 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 14 Jul 2022 21:40:40 +0200 Subject: [PATCH 201/248] net: lan966x: Fix usage of lan966x->mac_lock when used by FDB When the SW bridge was trying to add/remove entries to/from HW, the access to HW was not protected by any lock. In this way, it was possible to have race conditions. Fix this by using the lan966x->mac_lock to protect parallel access to HW for this cases. Fixes: 25ee9561ec622 ("net: lan966x: More MAC table functionality") Signed-off-by: Horatiu Vultur Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- .../ethernet/microchip/lan966x/lan966x_mac.c | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c index 69e343b7f4af..5893770bfd94 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c @@ -201,7 +201,6 @@ static struct lan966x_mac_entry *lan966x_mac_find_entry(struct lan966x *lan966x, struct lan966x_mac_entry *res = NULL; struct lan966x_mac_entry *mac_entry; - spin_lock(&lan966x->mac_lock); list_for_each_entry(mac_entry, &lan966x->mac_entries, list) { if (mac_entry->vid == vid && ether_addr_equal(mac, mac_entry->mac) && @@ -210,7 +209,6 @@ static struct lan966x_mac_entry *lan966x_mac_find_entry(struct lan966x *lan966x, break; } } - spin_unlock(&lan966x->mac_lock); return res; } @@ -253,8 +251,11 @@ int lan966x_mac_add_entry(struct lan966x *lan966x, struct lan966x_port *port, { struct lan966x_mac_entry *mac_entry; - if (lan966x_mac_lookup(lan966x, addr, vid, ENTRYTYPE_NORMAL)) + spin_lock(&lan966x->mac_lock); + if (lan966x_mac_lookup(lan966x, addr, vid, ENTRYTYPE_NORMAL)) { + spin_unlock(&lan966x->mac_lock); return 0; + } /* In case the entry already exists, don't add it again to SW, * just update HW, but we need to look in the actual HW because @@ -263,21 +264,25 @@ int lan966x_mac_add_entry(struct lan966x *lan966x, struct lan966x_port *port, * add the entry but without the extern_learn flag. */ mac_entry = lan966x_mac_find_entry(lan966x, addr, vid, port->chip_port); - if (mac_entry) - return lan966x_mac_learn(lan966x, port->chip_port, - addr, vid, ENTRYTYPE_LOCKED); + if (mac_entry) { + spin_unlock(&lan966x->mac_lock); + goto mac_learn; + } mac_entry = lan966x_mac_alloc_entry(addr, vid, port->chip_port); - if (!mac_entry) + if (!mac_entry) { + spin_unlock(&lan966x->mac_lock); return -ENOMEM; + } - spin_lock(&lan966x->mac_lock); list_add_tail(&mac_entry->list, &lan966x->mac_entries); spin_unlock(&lan966x->mac_lock); - lan966x_mac_learn(lan966x, port->chip_port, addr, vid, ENTRYTYPE_LOCKED); lan966x_fdb_call_notifiers(SWITCHDEV_FDB_OFFLOADED, addr, vid, port->dev); +mac_learn: + lan966x_mac_learn(lan966x, port->chip_port, addr, vid, ENTRYTYPE_LOCKED); + return 0; } @@ -291,8 +296,9 @@ int lan966x_mac_del_entry(struct lan966x *lan966x, const unsigned char *addr, list) { if (mac_entry->vid == vid && ether_addr_equal(addr, mac_entry->mac)) { - lan966x_mac_forget(lan966x, mac_entry->mac, mac_entry->vid, - ENTRYTYPE_LOCKED); + lan966x_mac_forget_locked(lan966x, mac_entry->mac, + mac_entry->vid, + ENTRYTYPE_LOCKED); list_del(&mac_entry->list); kfree(mac_entry); @@ -428,6 +434,12 @@ static void lan966x_mac_irq_process(struct lan966x *lan966x, u32 row, continue; spin_lock(&lan966x->mac_lock); + mac_entry = lan966x_mac_find_entry(lan966x, mac, vid, dest_idx); + if (mac_entry) { + spin_unlock(&lan966x->mac_lock); + continue; + } + mac_entry = lan966x_mac_alloc_entry(mac, vid, dest_idx); if (!mac_entry) { spin_unlock(&lan966x->mac_lock); From 3696c952da0733b843c8da3441345055b1cbacd9 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 16 Jul 2022 17:46:54 -0400 Subject: [PATCH 202/248] net: ethernet: mtk_eth_soc: fix off by one check of ARRAY_SIZE In mtk_wed_tx_ring_setup(.., int idx, ..), idx is used as an index here struct mtk_wed_ring *ring = &dev->tx_ring[idx]; The bounds of idx are checked here BUG_ON(idx > ARRAY_SIZE(dev->tx_ring)); If idx is the size of the array, it will pass this check and overflow. So change the check to >= . Fixes: 804775dfc288 ("net: ethernet: mtk_eth_soc: add support for Wireless Ethernet Dispatch (WED)") Signed-off-by: Tom Rix Link: https://lore.kernel.org/r/20220716214654.1540240-1-trix@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_wed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 8f0cd3196aac..29be2fcafea3 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -651,7 +651,7 @@ mtk_wed_tx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs) * WDMA RX. */ - BUG_ON(idx > ARRAY_SIZE(dev->tx_ring)); + BUG_ON(idx >= ARRAY_SIZE(dev->tx_ring)); if (mtk_wed_ring_alloc(dev, ring, MTK_WED_TX_RING_SIZE)) return -ENOMEM; From f838a63369818faadec4ad1736cfbd20ab5da00e Mon Sep 17 00:00:00 2001 From: Dawid Lukwinski Date: Fri, 15 Jul 2022 14:45:41 -0700 Subject: [PATCH 203/248] i40e: Fix erroneous adapter reinitialization during recovery process Fix an issue when driver incorrectly detects state of recovery process and erroneously reinitializes interrupts, which results in a kernel error and call trace message. The issue was caused by a combination of two factors: 1. Assuming the EMP reset issued after completing firmware recovery means the whole recovery process is complete. 2. Erroneous reinitialization of interrupt vector after detecting the above mentioned EMP reset. Fixes (1) by changing how recovery state change is detected and (2) by adjusting the conditional expression to ensure using proper interrupt reinitialization method, depending on the situation. Fixes: 4ff0ee1af016 ("i40e: Introduce recovery mode support") Signed-off-by: Dawid Lukwinski Signed-off-by: Jan Sokolowski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20220715214542.2968762-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index aa786fd55951..7f1a0d90dc51 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -10650,7 +10650,7 @@ static int i40e_reset(struct i40e_pf *pf) **/ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) { - int old_recovery_mode_bit = test_bit(__I40E_RECOVERY_MODE, pf->state); + const bool is_recovery_mode_reported = i40e_check_recovery_mode(pf); struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; struct i40e_hw *hw = &pf->hw; i40e_status ret; @@ -10658,13 +10658,11 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) int v; if (test_bit(__I40E_EMP_RESET_INTR_RECEIVED, pf->state) && - i40e_check_recovery_mode(pf)) { + is_recovery_mode_reported) i40e_set_ethtool_ops(pf->vsi[pf->lan_vsi]->netdev); - } if (test_bit(__I40E_DOWN, pf->state) && - !test_bit(__I40E_RECOVERY_MODE, pf->state) && - !old_recovery_mode_bit) + !test_bit(__I40E_RECOVERY_MODE, pf->state)) goto clear_recovery; dev_dbg(&pf->pdev->dev, "Rebuilding internal switch\n"); @@ -10691,13 +10689,12 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) * accordingly with regard to resources initialization * and deinitialization */ - if (test_bit(__I40E_RECOVERY_MODE, pf->state) || - old_recovery_mode_bit) { + if (test_bit(__I40E_RECOVERY_MODE, pf->state)) { if (i40e_get_capabilities(pf, i40e_aqc_opc_list_func_capabilities)) goto end_unlock; - if (test_bit(__I40E_RECOVERY_MODE, pf->state)) { + if (is_recovery_mode_reported) { /* we're staying in recovery mode so we'll reinitialize * misc vector here */ From 1e53834ce541d4fe271cdcca7703e50be0a44f8a Mon Sep 17 00:00:00 2001 From: Piotr Skajewski Date: Fri, 15 Jul 2022 14:44:56 -0700 Subject: [PATCH 204/248] ixgbe: Add locking to prevent panic when setting sriov_numvfs to zero It is possible to disable VFs while the PF driver is processing requests from the VF driver. This can result in a panic. BUG: unable to handle kernel paging request at 000000000000106c PGD 0 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 8 PID: 0 Comm: swapper/8 Kdump: loaded Tainted: G I --------- - Hardware name: Dell Inc. PowerEdge R740/06WXJT, BIOS 2.8.2 08/27/2020 RIP: 0010:ixgbe_msg_task+0x4c8/0x1690 [ixgbe] Code: 00 00 48 8d 04 40 48 c1 e0 05 89 7c 24 24 89 fd 48 89 44 24 10 83 ff 01 0f 84 b8 04 00 00 4c 8b 64 24 10 4d 03 a5 48 22 00 00 <41> 80 7c 24 4c 00 0f 84 8a 03 00 00 0f b7 c7 83 f8 08 0f 84 8f 0a RSP: 0018:ffffb337869f8df8 EFLAGS: 00010002 RAX: 0000000000001020 RBX: 0000000000000000 RCX: 000000000000002b RDX: 0000000000000002 RSI: 0000000000000008 RDI: 0000000000000006 RBP: 0000000000000006 R08: 0000000000000002 R09: 0000000000029780 R10: 00006957d8f42832 R11: 0000000000000000 R12: 0000000000001020 R13: ffff8a00e8978ac0 R14: 000000000000002b R15: ffff8a00e8979c80 FS: 0000000000000000(0000) GS:ffff8a07dfd00000(0000) knlGS:00000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000106c CR3: 0000000063e10004 CR4: 00000000007726e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? ttwu_do_wakeup+0x19/0x140 ? try_to_wake_up+0x1cd/0x550 ? ixgbevf_update_xcast_mode+0x71/0xc0 [ixgbevf] ixgbe_msix_other+0x17e/0x310 [ixgbe] __handle_irq_event_percpu+0x40/0x180 handle_irq_event_percpu+0x30/0x80 handle_irq_event+0x36/0x53 handle_edge_irq+0x82/0x190 handle_irq+0x1c/0x30 do_IRQ+0x49/0xd0 common_interrupt+0xf/0xf This can be eventually be reproduced with the following script: while : do echo 63 > /sys/class/net//device/sriov_numvfs sleep 1 echo 0 > /sys/class/net//device/sriov_numvfs sleep 1 done Add lock when disabling SR-IOV to prevent process VF mailbox communication. Fixes: d773d1310625 ("ixgbe: Fix memory leak when SR-IOV VFs are direct assigned") Signed-off-by: Piotr Skajewski Tested-by: Marek Szlosek Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20220715214456.2968711-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 +++ drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 921a4d977d65..8813b4dd6872 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -779,6 +779,7 @@ struct ixgbe_adapter { #ifdef CONFIG_IXGBE_IPSEC struct ixgbe_ipsec *ipsec; #endif /* CONFIG_IXGBE_IPSEC */ + spinlock_t vfs_lock; }; static inline int ixgbe_determine_xdp_q_idx(int cpu) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 77c2e70b0860..55f91c9ff047 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6403,6 +6403,9 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter, /* n-tuple support exists, always init our spinlock */ spin_lock_init(&adapter->fdir_perfect_lock); + /* init spinlock to avoid concurrency of VF resources */ + spin_lock_init(&adapter->vfs_lock); + #ifdef CONFIG_IXGBE_DCB ixgbe_init_dcb(adapter); #endif diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index d4e63f0644c3..a1e69c734863 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -205,10 +205,13 @@ void ixgbe_enable_sriov(struct ixgbe_adapter *adapter, unsigned int max_vfs) int ixgbe_disable_sriov(struct ixgbe_adapter *adapter) { unsigned int num_vfs = adapter->num_vfs, vf; + unsigned long flags; int rss; + spin_lock_irqsave(&adapter->vfs_lock, flags); /* set num VFs to 0 to prevent access to vfinfo */ adapter->num_vfs = 0; + spin_unlock_irqrestore(&adapter->vfs_lock, flags); /* put the reference to all of the vf devices */ for (vf = 0; vf < num_vfs; ++vf) { @@ -1355,8 +1358,10 @@ static void ixgbe_rcv_ack_from_vf(struct ixgbe_adapter *adapter, u32 vf) void ixgbe_msg_task(struct ixgbe_adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; + unsigned long flags; u32 vf; + spin_lock_irqsave(&adapter->vfs_lock, flags); for (vf = 0; vf < adapter->num_vfs; vf++) { /* process any reset requests */ if (!ixgbe_check_for_rst(hw, vf)) @@ -1370,6 +1375,7 @@ void ixgbe_msg_task(struct ixgbe_adapter *adapter) if (!ixgbe_check_for_ack(hw, vf)) ixgbe_rcv_ack_from_vf(adapter, vf); } + spin_unlock_irqrestore(&adapter->vfs_lock, flags); } static inline void ixgbe_ping_vf(struct ixgbe_adapter *adapter, int vf) From 4db2a5ef4ccbe6d138828284cfab241b434b5d95 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 15 Jul 2022 18:16:58 +0300 Subject: [PATCH 205/248] net: dsa: fix dsa_port_vlan_filtering when global The blamed refactoring commit changed a "port" iterator with "other_dp", but still looked at the slave_dev of the dp outside the loop, instead of other_dp->slave from the loop. As a result, dsa_port_vlan_filtering() would not call dsa_slave_manage_vlan_filtering() except for the port in cause, and not for all switch ports as expected. Fixes: d0004a020bb5 ("net: dsa: remove the "dsa_to_port in a loop" antipattern from the core") Reported-by: Lucian Banu Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/dsa/port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/port.c b/net/dsa/port.c index 3738f2d40a0b..a4052174ac50 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -799,7 +799,7 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, ds->vlan_filtering = vlan_filtering; dsa_switch_for_each_user_port(other_dp, ds) { - struct net_device *slave = dp->slave; + struct net_device *slave = other_dp->slave; /* We might be called in the unbind path, so not * all slave devices might still be registered. From 1699b4d502eda3c7ea4070debad3ee570b5091b1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 15 Jul 2022 18:16:59 +0300 Subject: [PATCH 206/248] net: dsa: fix NULL pointer dereference in dsa_port_reset_vlan_filtering The "ds" iterator variable used in dsa_port_reset_vlan_filtering() -> dsa_switch_for_each_port() overwrites the "dp" received as argument, which is later used to call dsa_port_vlan_filtering() proper. As a result, switches which do enter that code path (the ones with vlan_filtering_is_global=true) will dereference an invalid dp in dsa_port_reset_vlan_filtering() after leaving a VLAN-aware bridge. Use a dedicated "other_dp" iterator variable to avoid this from happening. Fixes: d0004a020bb5 ("net: dsa: remove the "dsa_to_port in a loop" antipattern from the core") Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/dsa/port.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/dsa/port.c b/net/dsa/port.c index a4052174ac50..2dd76eb1621c 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -248,6 +248,7 @@ static void dsa_port_reset_vlan_filtering(struct dsa_port *dp, struct netlink_ext_ack extack = {0}; bool change_vlan_filtering = false; struct dsa_switch *ds = dp->ds; + struct dsa_port *other_dp; bool vlan_filtering; int err; @@ -270,8 +271,8 @@ static void dsa_port_reset_vlan_filtering(struct dsa_port *dp, * VLAN-aware bridge. */ if (change_vlan_filtering && ds->vlan_filtering_is_global) { - dsa_switch_for_each_port(dp, ds) { - struct net_device *br = dsa_port_bridge_dev_get(dp); + dsa_switch_for_each_port(other_dp, ds) { + struct net_device *br = dsa_port_bridge_dev_get(other_dp); if (br && br_vlan_enabled(br)) { change_vlan_filtering = false; From da791bac104a3169b05b54270afe75daacba4641 Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Fri, 15 Jul 2022 20:24:02 +0800 Subject: [PATCH 207/248] net: stmmac: remove redunctant disable xPCS EEE call Disable is done in stmmac_init_eee() on the event of MAC link down. Since setting enable/disable EEE via ethtool will eventually trigger a MAC down, removing this redunctant call in stmmac_ethtool.c to avoid calling xpcs_config_eee() twice. Fixes: d4aeaed80b0e ("net: stmmac: trigger PCS EEE to turn off on link down") Signed-off-by: Wong Vee Khee Link: https://lore.kernel.org/r/20220715122402.1017470-1-vee.khee.wong@linux.intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index abfb3cd5958d..9c3055ee2608 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -803,14 +803,6 @@ static int stmmac_ethtool_op_set_eee(struct net_device *dev, netdev_warn(priv->dev, "Setting EEE tx-lpi is not supported\n"); - if (priv->hw->xpcs) { - ret = xpcs_config_eee(priv->hw->xpcs, - priv->plat->mult_fact_100ns, - edata->eee_enabled); - if (ret) - return ret; - } - if (!edata->eee_enabled) stmmac_disable_eee_mode(priv); From d7241f679a59cfe27f92cb5c6272cb429fb1f7ec Mon Sep 17 00:00:00 2001 From: Hristo Venev Date: Sat, 16 Jul 2022 11:51:34 +0300 Subject: [PATCH 208/248] be2net: Fix buffer overflow in be_get_module_eeprom be_cmd_read_port_transceiver_data assumes that it is given a buffer that is at least PAGE_DATA_LEN long, or twice that if the module supports SFF 8472. However, this is not always the case. Fix this by passing the desired offset and length to be_cmd_read_port_transceiver_data so that we only copy the bytes once. Fixes: e36edd9d26cf ("be2net: add ethtool "-m" option support") Signed-off-by: Hristo Venev Link: https://lore.kernel.org/r/20220716085134.6095-1-hristo@venev.name Signed-off-by: Paolo Abeni --- drivers/net/ethernet/emulex/benet/be_cmds.c | 10 +++--- drivers/net/ethernet/emulex/benet/be_cmds.h | 2 +- .../net/ethernet/emulex/benet/be_ethtool.c | 31 ++++++++++++------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 528eb0f223b1..b4f5e57d0285 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -2287,7 +2287,7 @@ err: /* Uses sync mcc */ int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, - u8 page_num, u8 *data) + u8 page_num, u32 off, u32 len, u8 *data) { struct be_dma_mem cmd; struct be_mcc_wrb *wrb; @@ -2321,10 +2321,10 @@ int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, req->port = cpu_to_le32(adapter->hba_port_num); req->page_num = cpu_to_le32(page_num); status = be_mcc_notify_wait(adapter); - if (!status) { + if (!status && len > 0) { struct be_cmd_resp_port_type *resp = cmd.va; - memcpy(data, resp->page_data, PAGE_DATA_LEN); + memcpy(data, resp->page_data + off, len); } err: mutex_unlock(&adapter->mcc_lock); @@ -2415,7 +2415,7 @@ int be_cmd_query_cable_type(struct be_adapter *adapter) int status; status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, - page_data); + 0, PAGE_DATA_LEN, page_data); if (!status) { switch (adapter->phy.interface_type) { case PHY_TYPE_QSFP: @@ -2440,7 +2440,7 @@ int be_cmd_query_sfp_info(struct be_adapter *adapter) int status; status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, - page_data); + 0, PAGE_DATA_LEN, page_data); if (!status) { strlcpy(adapter->phy.vendor_name, page_data + SFP_VENDOR_NAME_OFFSET, SFP_VENDOR_NAME_LEN - 1); diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h index db1f3b908582..e2085c68c0ee 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.h +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -2427,7 +2427,7 @@ int be_cmd_set_beacon_state(struct be_adapter *adapter, u8 port_num, u8 beacon, int be_cmd_get_beacon_state(struct be_adapter *adapter, u8 port_num, u32 *state); int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, - u8 page_num, u8 *data); + u8 page_num, u32 off, u32 len, u8 *data); int be_cmd_query_cable_type(struct be_adapter *adapter); int be_cmd_query_sfp_info(struct be_adapter *adapter); int lancer_cmd_read_object(struct be_adapter *adapter, struct be_dma_mem *cmd, diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index dfa784339781..bd0df189d871 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -1344,7 +1344,7 @@ static int be_get_module_info(struct net_device *netdev, return -EOPNOTSUPP; status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, - page_data); + 0, PAGE_DATA_LEN, page_data); if (!status) { if (!page_data[SFP_PLUS_SFF_8472_COMP]) { modinfo->type = ETH_MODULE_SFF_8079; @@ -1362,25 +1362,32 @@ static int be_get_module_eeprom(struct net_device *netdev, { struct be_adapter *adapter = netdev_priv(netdev); int status; + u32 begin, end; if (!check_privilege(adapter, MAX_PRIVILEGES)) return -EOPNOTSUPP; - status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, - data); - if (status) - goto err; + begin = eeprom->offset; + end = eeprom->offset + eeprom->len; - if (eeprom->offset + eeprom->len > PAGE_DATA_LEN) { - status = be_cmd_read_port_transceiver_data(adapter, - TR_PAGE_A2, - data + - PAGE_DATA_LEN); + if (begin < PAGE_DATA_LEN) { + status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, begin, + min_t(u32, end, PAGE_DATA_LEN) - begin, + data); + if (status) + goto err; + + data += PAGE_DATA_LEN - begin; + begin = PAGE_DATA_LEN; + } + + if (end > PAGE_DATA_LEN) { + status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A2, + begin - PAGE_DATA_LEN, + end - begin, data); if (status) goto err; } - if (eeprom->offset) - memcpy(data, data + eeprom->offset, eeprom->len); err: return be_cmd_status(status); } From 855fe49984a8a3899f07ae1d149d46cd8d4acb52 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Sun, 17 Jul 2022 15:58:30 +0200 Subject: [PATCH 209/248] net: dsa: sja1105: silent spi_device_id warnings Add spi_device_id entries to silent following warnings: SPI driver sja1105 has no spi_device_id for nxp,sja1105e SPI driver sja1105 has no spi_device_id for nxp,sja1105t SPI driver sja1105 has no spi_device_id for nxp,sja1105p SPI driver sja1105 has no spi_device_id for nxp,sja1105q SPI driver sja1105 has no spi_device_id for nxp,sja1105r SPI driver sja1105 has no spi_device_id for nxp,sja1105s SPI driver sja1105 has no spi_device_id for nxp,sja1110a SPI driver sja1105 has no spi_device_id for nxp,sja1110b SPI driver sja1105 has no spi_device_id for nxp,sja1110c SPI driver sja1105 has no spi_device_id for nxp,sja1110d Fixes: 5fa6863ba692 ("spi: Check we have a spi_device_id for each DT compatible") Signed-off-by: Oleksij Rempel Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20220717135831.2492844-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- drivers/net/dsa/sja1105/sja1105_main.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 72b6fc1932b5..698c7d1fb45c 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -3382,12 +3382,28 @@ static const struct of_device_id sja1105_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, sja1105_dt_ids); +static const struct spi_device_id sja1105_spi_ids[] = { + { "sja1105e" }, + { "sja1105t" }, + { "sja1105p" }, + { "sja1105q" }, + { "sja1105r" }, + { "sja1105s" }, + { "sja1110a" }, + { "sja1110b" }, + { "sja1110c" }, + { "sja1110d" }, + { }, +}; +MODULE_DEVICE_TABLE(spi, sja1105_spi_ids); + static struct spi_driver sja1105_driver = { .driver = { .name = "sja1105", .owner = THIS_MODULE, .of_match_table = of_match_ptr(sja1105_dt_ids), }, + .id_table = sja1105_spi_ids, .probe = sja1105_probe, .remove = sja1105_remove, .shutdown = sja1105_shutdown, From 1774559f07993e1cac33c2406e99049d4bdea6c8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Sun, 17 Jul 2022 15:58:31 +0200 Subject: [PATCH 210/248] net: dsa: vitesse-vsc73xx: silent spi_device_id warnings Add spi_device_id entries to silent SPI warnings. Fixes: 5fa6863ba692 ("spi: Check we have a spi_device_id for each DT compatible") Signed-off-by: Oleksij Rempel Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220717135831.2492844-2-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- drivers/net/dsa/vitesse-vsc73xx-spi.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/dsa/vitesse-vsc73xx-spi.c b/drivers/net/dsa/vitesse-vsc73xx-spi.c index 3110895358d8..97a92e6da60d 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-spi.c +++ b/drivers/net/dsa/vitesse-vsc73xx-spi.c @@ -205,10 +205,20 @@ static const struct of_device_id vsc73xx_of_match[] = { }; MODULE_DEVICE_TABLE(of, vsc73xx_of_match); +static const struct spi_device_id vsc73xx_spi_ids[] = { + { "vsc7385" }, + { "vsc7388" }, + { "vsc7395" }, + { "vsc7398" }, + { }, +}; +MODULE_DEVICE_TABLE(spi, vsc73xx_spi_ids); + static struct spi_driver vsc73xx_spi_driver = { .probe = vsc73xx_spi_probe, .remove = vsc73xx_spi_remove, .shutdown = vsc73xx_spi_shutdown, + .id_table = vsc73xx_spi_ids, .driver = { .name = "vsc73xx-spi", .of_match_table = vsc73xx_of_match, From 30e22a6ebca039572ce9bc10f1934f4eabfb5b7f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 17 Jul 2022 16:09:03 +0000 Subject: [PATCH 211/248] amt: use workqueue for gateway side message handling There are some synchronization issues(amt->status, amt->req_cnt, etc) if the interface is in gateway mode because gateway message handlers are processed concurrently. This applies a work queue for processing these messages instead of expanding the locking context. So, the purposes of this patch are to fix exist race conditions and to make gateway to be able to validate a gateway status more correctly. When the AMT gateway interface is created, it tries to establish to relay. The establishment step looks stateless, but it should be managed well. In order to handle messages in the gateway, it saves the current status(i.e. AMT_STATUS_XXX). This patch makes gateway code to be worked with a single thread. Now, all messages except the multicast are triggered(received or delay expired), and these messages will be stored in the event queue(amt->events). Then, the single worker processes stored messages asynchronously one by one. The multicast data message type will be still processed immediately. Now, amt->lock is only needed to access the event queue(amt->events) if an interface is the gateway mode. Fixes: cbc21dc1cfe9 ("amt: add data plane of amt interface") Signed-off-by: Taehee Yoo Signed-off-by: Paolo Abeni --- drivers/net/amt.c | 159 +++++++++++++++++++++++++++++++++++++++++----- include/net/amt.h | 20 ++++++ 2 files changed, 164 insertions(+), 15 deletions(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 89563d1b2a3b..9e2d1992b349 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -900,6 +900,28 @@ static void amt_send_mld_gq(struct amt_dev *amt, struct amt_tunnel_list *tunnel) } #endif +static bool amt_queue_event(struct amt_dev *amt, enum amt_event event, + struct sk_buff *skb) +{ + int index; + + spin_lock_bh(&amt->lock); + if (amt->nr_events >= AMT_MAX_EVENTS) { + spin_unlock_bh(&amt->lock); + return 1; + } + + index = (amt->event_idx + amt->nr_events) % AMT_MAX_EVENTS; + amt->events[index].event = event; + amt->events[index].skb = skb; + amt->nr_events++; + amt->event_idx %= AMT_MAX_EVENTS; + queue_work(amt_wq, &amt->event_wq); + spin_unlock_bh(&amt->lock); + + return 0; +} + static void amt_secret_work(struct work_struct *work) { struct amt_dev *amt = container_of(to_delayed_work(work), @@ -913,12 +935,8 @@ static void amt_secret_work(struct work_struct *work) msecs_to_jiffies(AMT_SECRET_TIMEOUT)); } -static void amt_discovery_work(struct work_struct *work) +static void amt_event_send_discovery(struct amt_dev *amt) { - struct amt_dev *amt = container_of(to_delayed_work(work), - struct amt_dev, - discovery_wq); - spin_lock_bh(&amt->lock); if (amt->status > AMT_STATUS_SENT_DISCOVERY) goto out; @@ -933,11 +951,19 @@ out: spin_unlock_bh(&amt->lock); } -static void amt_req_work(struct work_struct *work) +static void amt_discovery_work(struct work_struct *work) { struct amt_dev *amt = container_of(to_delayed_work(work), struct amt_dev, - req_wq); + discovery_wq); + + if (amt_queue_event(amt, AMT_EVENT_SEND_DISCOVERY, NULL)) + mod_delayed_work(amt_wq, &amt->discovery_wq, + msecs_to_jiffies(AMT_DISCOVERY_TIMEOUT)); +} + +static void amt_event_send_request(struct amt_dev *amt) +{ u32 exp; spin_lock_bh(&amt->lock); @@ -967,6 +993,17 @@ out: spin_unlock_bh(&amt->lock); } +static void amt_req_work(struct work_struct *work) +{ + struct amt_dev *amt = container_of(to_delayed_work(work), + struct amt_dev, + req_wq); + + if (amt_queue_event(amt, AMT_EVENT_SEND_REQUEST, NULL)) + mod_delayed_work(amt_wq, &amt->req_wq, + msecs_to_jiffies(100)); +} + static bool amt_send_membership_update(struct amt_dev *amt, struct sk_buff *skb, bool v6) @@ -2392,12 +2429,14 @@ static bool amt_membership_query_handler(struct amt_dev *amt, skb->pkt_type = PACKET_MULTICAST; skb->ip_summed = CHECKSUM_NONE; len = skb->len; + local_bh_disable(); if (__netif_rx(skb) == NET_RX_SUCCESS) { amt_update_gw_status(amt, AMT_STATUS_RECEIVED_QUERY, true); dev_sw_netstats_rx_add(amt->dev, len); } else { amt->dev->stats.rx_dropped++; } + local_bh_enable(); return false; } @@ -2688,6 +2727,38 @@ send: return false; } +static void amt_gw_rcv(struct amt_dev *amt, struct sk_buff *skb) +{ + int type = amt_parse_type(skb); + int err = 1; + + if (type == -1) + goto drop; + + if (amt->mode == AMT_MODE_GATEWAY) { + switch (type) { + case AMT_MSG_ADVERTISEMENT: + err = amt_advertisement_handler(amt, skb); + break; + case AMT_MSG_MEMBERSHIP_QUERY: + err = amt_membership_query_handler(amt, skb); + if (!err) + return; + break; + default: + netdev_dbg(amt->dev, "Invalid type of Gateway\n"); + break; + } + } +drop: + if (err) { + amt->dev->stats.rx_dropped++; + kfree_skb(skb); + } else { + consume_skb(skb); + } +} + static int amt_rcv(struct sock *sk, struct sk_buff *skb) { struct amt_dev *amt; @@ -2719,8 +2790,12 @@ static int amt_rcv(struct sock *sk, struct sk_buff *skb) err = true; goto drop; } - err = amt_advertisement_handler(amt, skb); - break; + if (amt_queue_event(amt, AMT_EVENT_RECEIVE, skb)) { + netdev_dbg(amt->dev, "AMT Event queue full\n"); + err = true; + goto drop; + } + goto out; case AMT_MSG_MULTICAST_DATA: if (iph->saddr != amt->remote_ip) { netdev_dbg(amt->dev, "Invalid Relay IP\n"); @@ -2738,11 +2813,12 @@ static int amt_rcv(struct sock *sk, struct sk_buff *skb) err = true; goto drop; } - err = amt_membership_query_handler(amt, skb); - if (err) + if (amt_queue_event(amt, AMT_EVENT_RECEIVE, skb)) { + netdev_dbg(amt->dev, "AMT Event queue full\n"); + err = true; goto drop; - else - goto out; + } + goto out; default: err = true; netdev_dbg(amt->dev, "Invalid type of Gateway\n"); @@ -2780,6 +2856,46 @@ out: return 0; } +static void amt_event_work(struct work_struct *work) +{ + struct amt_dev *amt = container_of(work, struct amt_dev, event_wq); + struct sk_buff *skb; + u8 event; + int i; + + for (i = 0; i < AMT_MAX_EVENTS; i++) { + spin_lock_bh(&amt->lock); + if (amt->nr_events == 0) { + spin_unlock_bh(&amt->lock); + return; + } + event = amt->events[amt->event_idx].event; + skb = amt->events[amt->event_idx].skb; + amt->events[amt->event_idx].event = AMT_EVENT_NONE; + amt->events[amt->event_idx].skb = NULL; + amt->nr_events--; + amt->event_idx++; + amt->event_idx %= AMT_MAX_EVENTS; + spin_unlock_bh(&amt->lock); + + switch (event) { + case AMT_EVENT_RECEIVE: + amt_gw_rcv(amt, skb); + break; + case AMT_EVENT_SEND_DISCOVERY: + amt_event_send_discovery(amt); + break; + case AMT_EVENT_SEND_REQUEST: + amt_event_send_request(amt); + break; + default: + if (skb) + kfree_skb(skb); + break; + } + } +} + static int amt_err_lookup(struct sock *sk, struct sk_buff *skb) { struct amt_dev *amt; @@ -2867,6 +2983,8 @@ static int amt_dev_open(struct net_device *dev) amt->ready4 = false; amt->ready6 = false; + amt->event_idx = 0; + amt->nr_events = 0; err = amt_socket_create(amt); if (err) @@ -2892,6 +3010,8 @@ static int amt_dev_stop(struct net_device *dev) struct amt_dev *amt = netdev_priv(dev); struct amt_tunnel_list *tunnel, *tmp; struct socket *sock; + struct sk_buff *skb; + int i; cancel_delayed_work_sync(&amt->req_wq); cancel_delayed_work_sync(&amt->discovery_wq); @@ -2904,6 +3024,15 @@ static int amt_dev_stop(struct net_device *dev) if (sock) udp_tunnel_sock_release(sock); + cancel_work_sync(&amt->event_wq); + for (i = 0; i < AMT_MAX_EVENTS; i++) { + skb = amt->events[i].skb; + if (skb) + kfree_skb(skb); + amt->events[i].event = AMT_EVENT_NONE; + amt->events[i].skb = NULL; + } + amt->ready4 = false; amt->ready6 = false; amt->req_cnt = 0; @@ -3146,8 +3275,8 @@ static int amt_newlink(struct net *net, struct net_device *dev, INIT_DELAYED_WORK(&amt->discovery_wq, amt_discovery_work); INIT_DELAYED_WORK(&amt->req_wq, amt_req_work); INIT_DELAYED_WORK(&amt->secret_wq, amt_secret_work); + INIT_WORK(&amt->event_wq, amt_event_work); INIT_LIST_HEAD(&amt->tunnel_list); - return 0; err: dev_put(amt->stream_dev); @@ -3280,7 +3409,7 @@ static int __init amt_init(void) if (err < 0) goto unregister_notifier; - amt_wq = alloc_workqueue("amt", WQ_UNBOUND, 1); + amt_wq = alloc_workqueue("amt", WQ_UNBOUND, 0); if (!amt_wq) { err = -ENOMEM; goto rtnl_unregister; diff --git a/include/net/amt.h b/include/net/amt.h index 0e40c3d64fcf..08fc30cf2f34 100644 --- a/include/net/amt.h +++ b/include/net/amt.h @@ -78,6 +78,15 @@ enum amt_status { #define AMT_STATUS_MAX (__AMT_STATUS_MAX - 1) +/* Gateway events only */ +enum amt_event { + AMT_EVENT_NONE, + AMT_EVENT_RECEIVE, + AMT_EVENT_SEND_DISCOVERY, + AMT_EVENT_SEND_REQUEST, + __AMT_EVENT_MAX, +}; + struct amt_header { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 type:4, @@ -292,6 +301,12 @@ struct amt_group_node { struct hlist_head sources[]; }; +#define AMT_MAX_EVENTS 16 +struct amt_events { + enum amt_event event; + struct sk_buff *skb; +}; + struct amt_dev { struct net_device *dev; struct net_device *stream_dev; @@ -308,6 +323,7 @@ struct amt_dev { struct delayed_work req_wq; /* Protected by RTNL */ struct delayed_work secret_wq; + struct work_struct event_wq; /* AMT status */ enum amt_status status; /* Generated key */ @@ -345,6 +361,10 @@ struct amt_dev { /* Used only in gateway mode */ u64 mac:48, reserved:16; + /* AMT gateway side message handler queue */ + struct amt_events events[AMT_MAX_EVENTS]; + u8 event_idx; + u8 nr_events; }; #define AMT_TOS 0xc0 From 9c343ea6185febe5f6b74f7f7b3757f3dd9c5af6 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 17 Jul 2022 16:09:04 +0000 Subject: [PATCH 212/248] amt: remove unnecessary locks By the previous patch, amt gateway handlers are changed to worked by a single thread. So, most locks for gateway are not needed. So, it removes. Fixes: cbc21dc1cfe9 ("amt: add data plane of amt interface") Signed-off-by: Taehee Yoo Signed-off-by: Paolo Abeni --- drivers/net/amt.c | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 9e2d1992b349..1f41ba526f3e 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -577,8 +577,8 @@ static struct sk_buff *amt_build_igmp_gq(struct amt_dev *amt) return skb; } -static void __amt_update_gw_status(struct amt_dev *amt, enum amt_status status, - bool validate) +static void amt_update_gw_status(struct amt_dev *amt, enum amt_status status, + bool validate) { if (validate && amt->status >= status) return; @@ -600,14 +600,6 @@ static void __amt_update_relay_status(struct amt_tunnel_list *tunnel, tunnel->status = status; } -static void amt_update_gw_status(struct amt_dev *amt, enum amt_status status, - bool validate) -{ - spin_lock_bh(&amt->lock); - __amt_update_gw_status(amt, status, validate); - spin_unlock_bh(&amt->lock); -} - static void amt_update_relay_status(struct amt_tunnel_list *tunnel, enum amt_status status, bool validate) { @@ -700,9 +692,7 @@ static void amt_send_discovery(struct amt_dev *amt) if (unlikely(net_xmit_eval(err))) amt->dev->stats.tx_errors++; - spin_lock_bh(&amt->lock); - __amt_update_gw_status(amt, AMT_STATUS_SENT_DISCOVERY, true); - spin_unlock_bh(&amt->lock); + amt_update_gw_status(amt, AMT_STATUS_SENT_DISCOVERY, true); out: rcu_read_unlock(); } @@ -937,18 +927,14 @@ static void amt_secret_work(struct work_struct *work) static void amt_event_send_discovery(struct amt_dev *amt) { - spin_lock_bh(&amt->lock); if (amt->status > AMT_STATUS_SENT_DISCOVERY) goto out; get_random_bytes(&amt->nonce, sizeof(__be32)); - spin_unlock_bh(&amt->lock); amt_send_discovery(amt); - spin_lock_bh(&amt->lock); out: mod_delayed_work(amt_wq, &amt->discovery_wq, msecs_to_jiffies(AMT_DISCOVERY_TIMEOUT)); - spin_unlock_bh(&amt->lock); } static void amt_discovery_work(struct work_struct *work) @@ -966,7 +952,6 @@ static void amt_event_send_request(struct amt_dev *amt) { u32 exp; - spin_lock_bh(&amt->lock); if (amt->status < AMT_STATUS_RECEIVED_ADVERTISEMENT) goto out; @@ -976,21 +961,18 @@ static void amt_event_send_request(struct amt_dev *amt) amt->ready4 = false; amt->ready6 = false; amt->remote_ip = 0; - __amt_update_gw_status(amt, AMT_STATUS_INIT, false); + amt_update_gw_status(amt, AMT_STATUS_INIT, false); amt->req_cnt = 0; goto out; } - spin_unlock_bh(&amt->lock); amt_send_request(amt, false); amt_send_request(amt, true); - spin_lock_bh(&amt->lock); - __amt_update_gw_status(amt, AMT_STATUS_SENT_REQUEST, true); + amt_update_gw_status(amt, AMT_STATUS_SENT_REQUEST, true); amt->req_cnt++; out: exp = min_t(u32, (1 * (1 << amt->req_cnt)), AMT_MAX_REQ_TIMEOUT); mod_delayed_work(amt_wq, &amt->req_wq, msecs_to_jiffies(exp * 1000)); - spin_unlock_bh(&amt->lock); } static void amt_req_work(struct work_struct *work) @@ -2386,12 +2368,10 @@ static bool amt_membership_query_handler(struct amt_dev *amt, ihv3 = skb_pull(skb, sizeof(*iph) + AMT_IPHDR_OPTS); skb_reset_transport_header(skb); skb_push(skb, sizeof(*iph) + AMT_IPHDR_OPTS); - spin_lock_bh(&amt->lock); amt->ready4 = true; amt->mac = amtmq->response_mac; amt->req_cnt = 0; amt->qi = ihv3->qqic; - spin_unlock_bh(&amt->lock); skb->protocol = htons(ETH_P_IP); eth->h_proto = htons(ETH_P_IP); ip_eth_mc_map(iph->daddr, eth->h_dest); @@ -2411,12 +2391,10 @@ static bool amt_membership_query_handler(struct amt_dev *amt, mld2q = skb_pull(skb, sizeof(*ip6h) + AMT_IP6HDR_OPTS); skb_reset_transport_header(skb); skb_push(skb, sizeof(*ip6h) + AMT_IP6HDR_OPTS); - spin_lock_bh(&amt->lock); amt->ready6 = true; amt->mac = amtmq->response_mac; amt->req_cnt = 0; amt->qi = mld2q->mld2q_qqic; - spin_unlock_bh(&amt->lock); skb->protocol = htons(ETH_P_IPV6); eth->h_proto = htons(ETH_P_IPV6); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); From 928f353cb8672f0d6078aad75eeec0ed33875b12 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 17 Jul 2022 16:09:05 +0000 Subject: [PATCH 213/248] amt: use READ_ONCE() in amt module There are some data races in the amt module. amt->ready4, amt->ready6, and amt->status can be accessed concurrently without locks. So, it uses READ_ONCE() and WRITE_ONCE(). Fixes: cbc21dc1cfe9 ("amt: add data plane of amt interface") Signed-off-by: Taehee Yoo Signed-off-by: Paolo Abeni --- drivers/net/amt.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 1f41ba526f3e..4c5c74d79c81 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -584,7 +584,7 @@ static void amt_update_gw_status(struct amt_dev *amt, enum amt_status status, return; netdev_dbg(amt->dev, "Update GW status %s -> %s", status_str[amt->status], status_str[status]); - amt->status = status; + WRITE_ONCE(amt->status, status); } static void __amt_update_relay_status(struct amt_tunnel_list *tunnel, @@ -958,8 +958,8 @@ static void amt_event_send_request(struct amt_dev *amt) if (amt->req_cnt > AMT_MAX_REQ_COUNT) { netdev_dbg(amt->dev, "Gateway is not ready"); amt->qi = AMT_INIT_REQ_TIMEOUT; - amt->ready4 = false; - amt->ready6 = false; + WRITE_ONCE(amt->ready4, false); + WRITE_ONCE(amt->ready6, false); amt->remote_ip = 0; amt_update_gw_status(amt, AMT_STATUS_INIT, false); amt->req_cnt = 0; @@ -1239,7 +1239,8 @@ static netdev_tx_t amt_dev_xmit(struct sk_buff *skb, struct net_device *dev) /* Gateway only passes IGMP/MLD packets */ if (!report) goto free; - if ((!v6 && !amt->ready4) || (v6 && !amt->ready6)) + if ((!v6 && !READ_ONCE(amt->ready4)) || + (v6 && !READ_ONCE(amt->ready6))) goto free; if (amt_send_membership_update(amt, skb, v6)) goto free; @@ -2368,7 +2369,7 @@ static bool amt_membership_query_handler(struct amt_dev *amt, ihv3 = skb_pull(skb, sizeof(*iph) + AMT_IPHDR_OPTS); skb_reset_transport_header(skb); skb_push(skb, sizeof(*iph) + AMT_IPHDR_OPTS); - amt->ready4 = true; + WRITE_ONCE(amt->ready4, true); amt->mac = amtmq->response_mac; amt->req_cnt = 0; amt->qi = ihv3->qqic; @@ -2391,7 +2392,7 @@ static bool amt_membership_query_handler(struct amt_dev *amt, mld2q = skb_pull(skb, sizeof(*ip6h) + AMT_IP6HDR_OPTS); skb_reset_transport_header(skb); skb_push(skb, sizeof(*ip6h) + AMT_IP6HDR_OPTS); - amt->ready6 = true; + WRITE_ONCE(amt->ready6, true); amt->mac = amtmq->response_mac; amt->req_cnt = 0; amt->qi = mld2q->mld2q_qqic; @@ -2898,7 +2899,7 @@ static int amt_err_lookup(struct sock *sk, struct sk_buff *skb) break; case AMT_MSG_REQUEST: case AMT_MSG_MEMBERSHIP_UPDATE: - if (amt->status >= AMT_STATUS_RECEIVED_ADVERTISEMENT) + if (READ_ONCE(amt->status) >= AMT_STATUS_RECEIVED_ADVERTISEMENT) mod_delayed_work(amt_wq, &amt->req_wq, 0); break; default: From 627f16931bf3cb20d50274d9341380ac2c3035fd Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 17 Jul 2022 16:09:06 +0000 Subject: [PATCH 214/248] amt: add missing regeneration nonce logic in request logic When AMT gateway starts sending a new request message, it should regenerate the nonce variable. Fixes: cbc21dc1cfe9 ("amt: add data plane of amt interface") Signed-off-by: Taehee Yoo Signed-off-by: Paolo Abeni --- drivers/net/amt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 4c5c74d79c81..e0743c20b780 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -963,9 +963,13 @@ static void amt_event_send_request(struct amt_dev *amt) amt->remote_ip = 0; amt_update_gw_status(amt, AMT_STATUS_INIT, false); amt->req_cnt = 0; + amt->nonce = 0; goto out; } + if (!amt->req_cnt) + get_random_bytes(&amt->nonce, sizeof(__be32)); + amt_send_request(amt, false); amt_send_request(amt, true); amt_update_gw_status(amt, AMT_STATUS_SENT_REQUEST, true); From 40185f359fbabaa61da754cc29d12f3a41e0a987 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 17 Jul 2022 16:09:07 +0000 Subject: [PATCH 215/248] amt: drop unexpected advertisement message AMT gateway interface should not receive unexpected advertisement messages. In order to drop these packets, it should check nonce and amt->status. Fixes: cbc21dc1cfe9 ("amt: add data plane of amt interface") Signed-off-by: Taehee Yoo Signed-off-by: Paolo Abeni --- drivers/net/amt.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index e0743c20b780..2d007c5c5463 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -2260,6 +2260,10 @@ static bool amt_advertisement_handler(struct amt_dev *amt, struct sk_buff *skb) ipv4_is_zeronet(amta->ip4)) return true; + if (amt->status != AMT_STATUS_SENT_DISCOVERY || + amt->nonce != amta->nonce) + return true; + amt->remote_ip = amta->ip4; netdev_dbg(amt->dev, "advertised remote ip = %pI4\n", &amt->remote_ip); mod_delayed_work(amt_wq, &amt->req_wq, 0); @@ -2975,6 +2979,7 @@ static int amt_dev_open(struct net_device *dev) amt->req_cnt = 0; amt->remote_ip = 0; + amt->nonce = 0; get_random_bytes(&amt->key, sizeof(siphash_key_t)); amt->status = AMT_STATUS_INIT; From 239d886601e38d948a28f3b2a1c9ce5f01bf75f2 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 17 Jul 2022 16:09:08 +0000 Subject: [PATCH 216/248] amt: drop unexpected query message AMT gateway interface should not receive unexpected query messages. In order to drop unexpected query messages, it checks nonce. And it also checks ready4 and ready6 variables to drop duplicated messages. Fixes: cbc21dc1cfe9 ("amt: add data plane of amt interface") Signed-off-by: Taehee Yoo Signed-off-by: Paolo Abeni --- drivers/net/amt.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 2d007c5c5463..8dc7b8f258c8 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -967,8 +967,11 @@ static void amt_event_send_request(struct amt_dev *amt) goto out; } - if (!amt->req_cnt) + if (!amt->req_cnt) { + WRITE_ONCE(amt->ready4, false); + WRITE_ONCE(amt->ready6, false); get_random_bytes(&amt->nonce, sizeof(__be32)); + } amt_send_request(amt, false); amt_send_request(amt, true); @@ -2353,6 +2356,9 @@ static bool amt_membership_query_handler(struct amt_dev *amt, if (amtmq->reserved || amtmq->version) return true; + if (amtmq->nonce != amt->nonce) + return true; + hdr_size -= sizeof(*eth); if (iptunnel_pull_header(skb, hdr_size, htons(ETH_P_TEB), false)) return true; @@ -2367,6 +2373,9 @@ static bool amt_membership_query_handler(struct amt_dev *amt, iph = ip_hdr(skb); if (iph->version == 4) { + if (READ_ONCE(amt->ready4)) + return true; + if (!pskb_may_pull(skb, sizeof(*iph) + AMT_IPHDR_OPTS + sizeof(*ihv3))) return true; @@ -2389,6 +2398,9 @@ static bool amt_membership_query_handler(struct amt_dev *amt, struct mld2_query *mld2q; struct ipv6hdr *ip6h; + if (READ_ONCE(amt->ready6)) + return true; + if (!pskb_may_pull(skb, sizeof(*ip6h) + AMT_IP6HDR_OPTS + sizeof(*mld2q))) return true; From e882827d5b8942a27b4d28548aa27562a3a7e94c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 17 Jul 2022 16:09:09 +0000 Subject: [PATCH 217/248] amt: drop unexpected multicast data AMT gateway interface should not receive unexpected multicast data. Multicast data message type should be received after sending an update message, which means all establishment between gateway and relay is finished. So, amt_multicast_data_handler() checks amt->status. Fixes: cbc21dc1cfe9 ("amt: add data plane of amt interface") Signed-off-by: Taehee Yoo Signed-off-by: Paolo Abeni --- drivers/net/amt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 8dc7b8f258c8..051e92ed56c0 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -2282,6 +2282,9 @@ static bool amt_multicast_data_handler(struct amt_dev *amt, struct sk_buff *skb) struct ethhdr *eth; struct iphdr *iph; + if (READ_ONCE(amt->status) != AMT_STATUS_SENT_UPDATE) + return true; + hdr_size = sizeof(*amtmd) + sizeof(struct udphdr); if (!pskb_may_pull(skb, hdr_size)) return true; From 989918482bbccbbce3ba2bb9156eb4c193319983 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 17 Jul 2022 16:09:10 +0000 Subject: [PATCH 218/248] amt: do not use amt->nr_tunnels outside of lock amt->nr_tunnels is protected by amt->lock. But, amt_request_handler() has been using this variable without the amt->lock. So, it expands context of amt->lock in the amt_request_handler() to protect amt->nr_tunnels variable. Fixes: cbc21dc1cfe9 ("amt: add data plane of amt interface") Signed-off-by: Taehee Yoo Signed-off-by: Paolo Abeni --- drivers/net/amt.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 051e92ed56c0..e019526e1df6 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -2679,7 +2679,9 @@ static bool amt_request_handler(struct amt_dev *amt, struct sk_buff *skb) if (tunnel->ip4 == iph->saddr) goto send; + spin_lock_bh(&amt->lock); if (amt->nr_tunnels >= amt->max_tunnels) { + spin_unlock_bh(&amt->lock); icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); return true; } @@ -2687,8 +2689,10 @@ static bool amt_request_handler(struct amt_dev *amt, struct sk_buff *skb) tunnel = kzalloc(sizeof(*tunnel) + (sizeof(struct hlist_head) * amt->hash_buckets), GFP_ATOMIC); - if (!tunnel) + if (!tunnel) { + spin_unlock_bh(&amt->lock); return true; + } tunnel->source_port = udph->source; tunnel->ip4 = iph->saddr; @@ -2701,10 +2705,9 @@ static bool amt_request_handler(struct amt_dev *amt, struct sk_buff *skb) INIT_DELAYED_WORK(&tunnel->gc_wq, amt_tunnel_expire); - spin_lock_bh(&amt->lock); list_add_tail_rcu(&tunnel->list, &amt->tunnel_list); tunnel->key = amt->key; - amt_update_relay_status(tunnel, AMT_STATUS_RECEIVED_REQUEST, true); + __amt_update_relay_status(tunnel, AMT_STATUS_RECEIVED_REQUEST, true); amt->nr_tunnels++; mod_delayed_work(amt_wq, &tunnel->gc_wq, msecs_to_jiffies(amt_gmi(amt))); From cdf0b86b250fd3c1c3e120c86583ea510c52e4ce Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Mon, 18 Jul 2022 16:21:20 +0800 Subject: [PATCH 219/248] r8152: fix a WOL issue This fixes that the platform is waked by an unexpected packet. The size and range of FIFO is different when the device enters S3 state, so it is necessary to correct some settings when suspending. Regardless of jumbo frame, set RMS to 1522 and MTPS to MTPS_DEFAULT. Besides, enable MCU_BORW_EN to update the method of calculating the pointer of data. Then, the hardware could get the correct data. Fixes: 195aae321c82 ("r8152: support new chips") Signed-off-by: Hayes Wang Link: https://lore.kernel.org/r/20220718082120.10957-391-nic_swsd@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index b082819509e1..0f6efaabaa32 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -32,7 +32,7 @@ #define NETNEXT_VERSION "12" /* Information for net */ -#define NET_VERSION "12" +#define NET_VERSION "13" #define DRIVER_VERSION "v1." NETNEXT_VERSION "." NET_VERSION #define DRIVER_AUTHOR "Realtek linux nic maintainers " @@ -5917,7 +5917,8 @@ static void r8153_enter_oob(struct r8152 *tp) wait_oob_link_list_ready(tp); - ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, mtu_to_size(tp->netdev->mtu)); + ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, 1522); + ocp_write_byte(tp, MCU_TYPE_PLA, PLA_MTPS, MTPS_DEFAULT); switch (tp->version) { case RTL_VER_03: @@ -5953,6 +5954,10 @@ static void r8153_enter_oob(struct r8152 *tp) ocp_data |= NOW_IS_OOB | DIS_MCU_CLROOB; ocp_write_byte(tp, MCU_TYPE_PLA, PLA_OOB_CTRL, ocp_data); + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_SFF_STS_7); + ocp_data |= MCU_BORW_EN; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_SFF_STS_7, ocp_data); + rxdy_gated_en(tp, false); ocp_data = ocp_read_dword(tp, MCU_TYPE_PLA, PLA_RCR); @@ -6555,6 +6560,9 @@ static void rtl8156_down(struct r8152 *tp) rtl_disable(tp); rtl_reset_bmu(tp); + ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, 1522); + ocp_write_byte(tp, MCU_TYPE_PLA, PLA_MTPS, MTPS_DEFAULT); + /* Clear teredo wake event. bit[15:8] is the teredo wakeup * type. Set it to zero. bits[7:0] are the W1C bits about * the events. Set them to all 1 to clear them. @@ -6565,6 +6573,10 @@ static void rtl8156_down(struct r8152 *tp) ocp_data |= NOW_IS_OOB; ocp_write_byte(tp, MCU_TYPE_PLA, PLA_OOB_CTRL, ocp_data); + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_SFF_STS_7); + ocp_data |= MCU_BORW_EN; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_SFF_STS_7, ocp_data); + rtl_rx_vlan_en(tp, true); rxdy_gated_en(tp, false); From 53eb9b04560cc368b7874a7ef1ca7666741739e4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 18 Jul 2022 11:51:53 +0200 Subject: [PATCH 220/248] net: ethernet: mtk_ppe: fix possible NULL pointer dereference in mtk_flow_get_wdma_info odev pointer can be NULL in mtk_flow_offload_replace routine according to the flower action rules. Fix possible NULL pointer dereference in mtk_flow_get_wdma_info. Fixes: a333215e10cb5 ("net: ethernet: mtk_eth_soc: implement flow offloading to WED devices") Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/4e1685bc4976e21e364055f6bee86261f8f9ee93.1658137753.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_ppe_offload.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c index 90e7dfd011c9..5d457bc9acc1 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c @@ -93,6 +93,9 @@ mtk_flow_get_wdma_info(struct net_device *dev, const u8 *addr, struct mtk_wdma_i }; struct net_device_path path = {}; + if (!ctx.dev) + return -ENODEV; + memcpy(ctx.daddr, addr, sizeof(ctx.daddr)); if (!IS_ENABLED(CONFIG_NET_MEDIATEK_SOC_WED)) From c6b10de537b904fb70522d8cc4600c2f11246c93 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Jul 2022 13:56:59 -0400 Subject: [PATCH 221/248] Documentation: fix udp_wmem_min in ip-sysctl.rst UDP doesn't support tx memory accounting, and sysctl udp_wmem_min is not really used anywhere. So we should fix the description in ip-sysctl.rst accordingly. Fixes: 95766fff6b9a ("[UDP]: Add memory accounting.") Signed-off-by: Xin Long Link: https://lore.kernel.org/r/c880a963d9b1fb5f442ae3c9e4dfa70d45296a16.1658167019.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index b3a534ed0e7c..66c72230eaad 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1052,11 +1052,7 @@ udp_rmem_min - INTEGER Default: 4K udp_wmem_min - INTEGER - Minimal size of send buffer used by UDP sockets in moderation. - Each UDP socket is able to use the size for sending data, even if - total pages of UDP sockets exceed udp_mem pressure. The unit is byte. - - Default: 4K + UDP does not have tx memory accounting and this tunable has no effect. RAW variables ============= From db87c005b9cce0b815b2268963502c178a1e27c8 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 5 Jul 2022 21:30:38 +0200 Subject: [PATCH 222/248] can: mcp251xfd: fix detection of mcp251863 In commit c6f2a617a0a8 ("can: mcp251xfd: add support for mcp251863") support for the mcp251863 was added. However it was not taken into account that the auto detection of the chip model cannot distinguish between mcp2518fd and mcp251863 and would lead to a warning message if the firmware specifies a mcp251863. Fix auto detection: If a mcp2518fd compatible chip is found, keep the mcp251863 if specified by firmware, use mcp2518fd instead. Link: https://lore.kernel.org/all/20220706064835.1848864-1-mkl@pengutronix.de Fixes: c6f2a617a0a8 ("can: mcp251xfd: add support for mcp251863") Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 9b47b07162fe..bc6518504fd4 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1690,8 +1690,8 @@ static int mcp251xfd_register_chip_detect(struct mcp251xfd_priv *priv) u32 osc; int err; - /* The OSC_LPMEN is only supported on MCP2518FD, so use it to - * autodetect the model. + /* The OSC_LPMEN is only supported on MCP2518FD and MCP251863, + * so use it to autodetect the model. */ err = regmap_update_bits(priv->map_reg, MCP251XFD_REG_OSC, MCP251XFD_REG_OSC_LPMEN, @@ -1703,10 +1703,18 @@ static int mcp251xfd_register_chip_detect(struct mcp251xfd_priv *priv) if (err) return err; - if (osc & MCP251XFD_REG_OSC_LPMEN) - devtype_data = &mcp251xfd_devtype_data_mcp2518fd; - else + if (osc & MCP251XFD_REG_OSC_LPMEN) { + /* We cannot distinguish between MCP2518FD and + * MCP251863. If firmware specifies MCP251863, keep + * it, otherwise set to MCP2518FD. + */ + if (mcp251xfd_is_251863(priv)) + devtype_data = &mcp251xfd_devtype_data_mcp251863; + else + devtype_data = &mcp251xfd_devtype_data_mcp2518fd; + } else { devtype_data = &mcp251xfd_devtype_data_mcp2517fd; + } if (!mcp251xfd_is_251XFD(priv) && priv->devtype_data.model != devtype_data->model) { From 7b66dfcc6e1e1f018492619c3d0fc432b6b54272 Mon Sep 17 00:00:00 2001 From: Liang He Date: Tue, 12 Jul 2022 17:56:23 +0800 Subject: [PATCH 223/248] can: rcar_canfd: Add missing of_node_put() in rcar_canfd_probe() We should use of_node_put() for the reference returned by of_get_child_by_name() which has increased the refcount. Fixes: 45721c406dcf ("can: rcar_canfd: Add support for r8a779a0 SoC") Link: https://lore.kernel.org/all/20220712095623.364287-1-windhl@126.com Signed-off-by: Liang He Signed-off-by: Marc Kleine-Budde --- drivers/net/can/rcar/rcar_canfd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index ba42cef10a53..cb0321ea853c 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1843,6 +1843,7 @@ static int rcar_canfd_probe(struct platform_device *pdev) of_child = of_get_child_by_name(pdev->dev.of_node, name); if (of_child && of_device_is_available(of_child)) channels_mask |= BIT(i); + of_node_put(of_child); } if (chip_id != RENESAS_RZG2L) { From 87507bcb4f5de16bb419e9509d874f4db6c0ad0f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:39 -0700 Subject: [PATCH 224/248] ipv4: Fix a data-race around sysctl_fib_multipath_use_neigh. While reading sysctl_fib_multipath_use_neigh, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: a6db4494d218 ("net: ipv4: Consider failed nexthops in multipath routes") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d9fdcbae16ee..db7b2503f068 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -2216,7 +2216,7 @@ void fib_select_multipath(struct fib_result *res, int hash) } change_nexthops(fi) { - if (net->ipv4.sysctl_fib_multipath_use_neigh) { + if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) { if (!fib_good_nh(nexthop_nh)) continue; if (!first) { From 7998c12a08c97cc26660532c9f90a34bd7d8da5a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:40 -0700 Subject: [PATCH 225/248] ipv4: Fix data-races around sysctl_fib_multipath_hash_policy. While reading sysctl_fib_multipath_hash_policy, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: bf4e0a3db97e ("net: ipv4: add support for ECMP hash policy choice") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +- net/ipv4/route.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 868d28f3b4e1..de63a5f3b767 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -10324,7 +10324,7 @@ static void mlxsw_sp_mp4_hash_init(struct mlxsw_sp *mlxsw_sp, unsigned long *fields = config->fields; u32 hash_fields; - switch (net->ipv4.sysctl_fib_multipath_hash_policy) { + switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) { case 0: mlxsw_sp_mp4_hash_outer_addr(config); break; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 91c4f60de75a..521194dd1c99 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2048,7 +2048,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, struct flow_keys hash_keys; u32 mhash = 0; - switch (net->ipv4.sysctl_fib_multipath_hash_policy) { + switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; From 8895a9c2ac76fb9d3922fed4fe092c8ec5e5cccc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:41 -0700 Subject: [PATCH 226/248] ipv4: Fix data-races around sysctl_fib_multipath_hash_fields. While reading sysctl_fib_multipath_hash_fields, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: ce5c9c20d364 ("ipv4: Add a sysctl to control multipath hash fields") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +- net/ipv4/route.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index de63a5f3b767..85aa1c468cd4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -10342,7 +10342,7 @@ static void mlxsw_sp_mp4_hash_init(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_mp_hash_inner_l3(config); break; case 3: - hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); /* Outer */ MLXSW_SP_MP_HASH_HEADER_SET(headers, IPV4_EN_NOT_TCP_NOT_UDP); MLXSW_SP_MP_HASH_HEADER_SET(headers, IPV4_EN_TCP_UDP); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 521194dd1c99..4702c61207a8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1929,7 +1929,7 @@ static u32 fib_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { - u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) @@ -1958,7 +1958,7 @@ static u32 fib_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { - u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was @@ -2018,7 +2018,7 @@ static u32 fib_multipath_custom_hash_skb(const struct net *net, static u32 fib_multipath_custom_hash_fl4(const struct net *net, const struct flowi4 *fl4) { - u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) From 9b55c20f83369dd54541d9ddbe3a018a8377f451 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:42 -0700 Subject: [PATCH 227/248] ip: Fix data-races around sysctl_ip_prot_sock. sysctl_ip_prot_sock is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. Fixes: 4548b683b781 ("Introduce a sysctl that modifies the value of PROT_SOCK.") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 4a15b6bcb4b8..1c979fd1904c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -357,7 +357,7 @@ static inline bool sysctl_dev_name_is_allowed(const char *name) static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { - return port < net->ipv4.sysctl_ip_prot_sock; + return port < READ_ONCE(net->ipv4.sysctl_ip_prot_sock); } #else diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 130e9c130311..5490c285668b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -84,7 +84,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, * port limit. */ if ((range[1] < range[0]) || - (range[0] < net->ipv4.sysctl_ip_prot_sock)) + (range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock))) ret = -EINVAL; else set_local_port_range(net, range); @@ -110,7 +110,7 @@ static int ipv4_privileged_ports(struct ctl_table *table, int write, .extra2 = &ip_privileged_port_max, }; - pports = net->ipv4.sysctl_ip_prot_sock; + pports = READ_ONCE(net->ipv4.sysctl_ip_prot_sock); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); @@ -122,7 +122,7 @@ static int ipv4_privileged_ports(struct ctl_table *table, int write, if (range[0] < pports) ret = -EINVAL; else - net->ipv4.sysctl_ip_prot_sock = pports; + WRITE_ONCE(net->ipv4.sysctl_ip_prot_sock, pports); } return ret; From 3d72bb4188c708bb16758c60822fc4dda7a95174 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:43 -0700 Subject: [PATCH 228/248] udp: Fix a data-race around sysctl_udp_l3mdev_accept. While reading sysctl_udp_l3mdev_accept, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 63a6fff353d0 ("net: Avoid receiving packets with an l3mdev on unbound UDP sockets") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/udp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/udp.h b/include/net/udp.h index bb4c227299cc..8dd4aa1485a6 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -238,7 +238,7 @@ static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - return inet_bound_dev_eq(!!net->ipv4.sysctl_udp_l3mdev_accept, + return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_udp_l3mdev_accept), bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); From 3666f666e99600518ab20982af04a078bbdad277 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:44 -0700 Subject: [PATCH 229/248] tcp: Fix data-races around sysctl knobs related to SYN option. While reading these knobs, they can be changed concurrently. Thus, we need to add READ_ONCE() to their readers. - tcp_sack - tcp_window_scaling - tcp_timestamps Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- .../ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 6 +++--- net/core/secure_seq.c | 4 ++-- net/ipv4/syncookies.c | 6 +++--- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_output.c | 10 +++++----- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 7c760aa65540..ddfe9208529a 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1236,8 +1236,8 @@ static struct sock *chtls_recv_sock(struct sock *lsk, csk->sndbuf = newsk->sk_sndbuf; csk->smac_idx = ((struct port_info *)netdev_priv(ndev))->smt_idx; RCV_WSCALE(tp) = select_rcv_wscale(tcp_full_space(newsk), - sock_net(newsk)-> - ipv4.sysctl_tcp_window_scaling, + READ_ONCE(sock_net(newsk)-> + ipv4.sysctl_tcp_window_scaling), tp->window_clamp); neigh_release(n); inet_inherit_port(&tcp_hashinfo, lsk, newsk); @@ -1384,7 +1384,7 @@ static void chtls_pass_accept_request(struct sock *sk, #endif } if (req->tcpopt.wsf <= 14 && - sock_net(sk)->ipv4.sysctl_tcp_window_scaling) { + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) { inet_rsk(oreq)->wscale_ok = 1; inet_rsk(oreq)->snd_wscale = req->tcpopt.wsf; } diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 5f85e01d4093..b0ff6153be62 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -64,7 +64,7 @@ u32 secure_tcpv6_ts_off(const struct net *net, .daddr = *(struct in6_addr *)daddr, }; - if (net->ipv4.sysctl_tcp_timestamps != 1) + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) return 0; ts_secret_init(); @@ -120,7 +120,7 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #ifdef CONFIG_INET u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr) { - if (net->ipv4.sysctl_tcp_timestamps != 1) + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) return 0; ts_secret_init(); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 9b234b42021e..942d2dfa1115 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -247,12 +247,12 @@ bool cookie_timestamp_decode(const struct net *net, return true; } - if (!net->ipv4.sysctl_tcp_timestamps) + if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return false; tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; - if (tcp_opt->sack_ok && !net->ipv4.sysctl_tcp_sack) + if (tcp_opt->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return false; if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) @@ -261,7 +261,7 @@ bool cookie_timestamp_decode(const struct net *net, tcp_opt->wscale_ok = 1; tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; - return net->ipv4.sysctl_tcp_window_scaling != 0; + return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0; } EXPORT_SYMBOL(cookie_timestamp_decode); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d451248bebec..92626e15115c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4060,7 +4060,7 @@ void tcp_parse_options(const struct net *net, break; case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW && th->syn && - !estab && net->ipv4.sysctl_tcp_window_scaling) { + !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) { __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; if (snd_wscale > TCP_MAX_WSCALE) { @@ -4076,7 +4076,7 @@ void tcp_parse_options(const struct net *net, case TCPOPT_TIMESTAMP: if ((opsize == TCPOLEN_TIMESTAMP) && ((estab && opt_rx->tstamp_ok) || - (!estab && net->ipv4.sysctl_tcp_timestamps))) { + (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) { opt_rx->saw_tstamp = 1; opt_rx->rcv_tsval = get_unaligned_be32(ptr); opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); @@ -4084,7 +4084,7 @@ void tcp_parse_options(const struct net *net, break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM && th->syn && - !estab && net->ipv4.sysctl_tcp_sack) { + !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) { opt_rx->sack_ok = TCP_SACK_SEEN; tcp_sack_reset(opt_rx); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3b3552d292a5..38a71e711edc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -791,18 +791,18 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, opts->mss = tcp_advertise_mss(sk); remaining -= TCPOLEN_MSS_ALIGNED; - if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) { + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } - if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) { + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } - if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) { + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; @@ -3647,7 +3647,7 @@ static void tcp_connect_init(struct sock *sk) * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. */ tp->tcp_header_len = sizeof(struct tcphdr); - if (sock_net(sk)->ipv4.sysctl_tcp_timestamps) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; #ifdef CONFIG_TCP_MD5SIG @@ -3683,7 +3683,7 @@ static void tcp_connect_init(struct sock *sk) tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, - sock_net(sk)->ipv4.sysctl_tcp_window_scaling, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling), &rcv_wscale, rcv_wnd); From 52e65865deb6a36718a463030500f16530eaab74 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:45 -0700 Subject: [PATCH 230/248] tcp: Fix a data-race around sysctl_tcp_early_retrans. While reading sysctl_tcp_early_retrans, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: eed530b6c676 ("tcp: early retransmit") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 38a71e711edc..898fcdcb7989 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2741,7 +2741,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rcu_access_pointer(tp->fastopen_rsk)) return false; - early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; + early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans); /* Schedule a loss probe in 2*RTT for SACK capable connections * not in loss recovery, that are either limited by cwnd or application. */ From e7d2ef837e14a971a05f60ea08c47f3fed1a36e4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:46 -0700 Subject: [PATCH 231/248] tcp: Fix data-races around sysctl_tcp_recovery. While reading sysctl_tcp_recovery, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 4f41b1c58a32 ("tcp: use RACK to detect losses") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- net/ipv4/tcp_recovery.c | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 92626e15115c..36eabd109e7c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2095,7 +2095,8 @@ static inline void tcp_init_undo(struct tcp_sock *tp) static bool tcp_is_rack(const struct sock *sk) { - return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION; + return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & + TCP_RACK_LOSS_DETECTION; } /* If we detect SACK reneging, forget all SACK information diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 48f30e7209f2..50abaa941387 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -14,7 +14,8 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk) return 0; if (tp->sacked_out >= tp->reordering && - !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH)) + !(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & + TCP_RACK_NO_DUPTHRESH)) return 0; } @@ -187,7 +188,8 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); - if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND || + if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & + TCP_RACK_STATIC_REO_WND) || !rs->prior_delivered) return; From 7c6f2a86ca590d5187a073d987e9599985fb1c7c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:47 -0700 Subject: [PATCH 232/248] tcp: Fix a data-race around sysctl_tcp_thin_linear_timeouts. While reading sysctl_tcp_thin_linear_timeouts, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 36e31b0af587 ("net: TCP thin linear timeouts") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ec5277becc6a..50bba370486e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -578,7 +578,7 @@ out_reset_timer: * linear-timeout retransmissions into a black hole */ if (sk->sk_state == TCP_ESTABLISHED && - (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) && + (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) && tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; From 4845b5713ab18a1bb6e31d1fbb4d600240b8b691 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:48 -0700 Subject: [PATCH 233/248] tcp: Fix data-races around sysctl_tcp_slow_start_after_idle. While reading sysctl_tcp_slow_start_after_idle, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 35089bb203f4 ("[TCP]: Add tcp_slow_start_after_idle sysctl.") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/tcp.h | 4 ++-- net/ipv4/tcp_output.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6ee1fb4fb292..071735e10872 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1403,8 +1403,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); s32 delta; - if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out || - ca_ops->cong_control) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) || + tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 898fcdcb7989..51120407c570 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1898,7 +1898,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle && + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); From 1a63cb91f0c2fcdeced6d6edee8d1d886583d139 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:49 -0700 Subject: [PATCH 234/248] tcp: Fix a data-race around sysctl_tcp_retrans_collapse. While reading sysctl_tcp_retrans_collapse, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 51120407c570..c38e07b50639 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3105,7 +3105,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, struct sk_buff *skb = to, *tmp; bool first = true; - if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; From 4e08ed41cb1194009fc1a916a59ce3ed4afd77cd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:50 -0700 Subject: [PATCH 235/248] tcp: Fix a data-race around sysctl_tcp_stdurg. While reading sysctl_tcp_stdurg, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 36eabd109e7c..31a9d2b8ecdc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5572,7 +5572,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); - if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) + if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg)) ptr--; ptr += ntohl(th->seq); From 0b484c91911e758e53656d570de58c2ed81ec6f2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:51 -0700 Subject: [PATCH 236/248] tcp: Fix a data-race around sysctl_tcp_rfc1337. While reading sysctl_tcp_rfc1337, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6854bb1fb32b..700ea548d125 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -173,7 +173,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ - if (twsk_net(tw)->ipv4.sysctl_tcp_rfc1337 == 0) { + if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; From 2d17d9c7382327d00aeaea35af44e9b26d53206e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:52 -0700 Subject: [PATCH 237/248] tcp: Fix a data-race around sysctl_tcp_abort_on_overflow. While reading sysctl_tcp_abort_on_overflow, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 700ea548d125..cb95d88497ae 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -781,7 +781,7 @@ listen_overflow: if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); - if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { inet_rsk(req)->acked = 1; return NULL; } From a11e5b3e7a59fde1a90b0eaeaa82320495cf8cae Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jul 2022 10:26:53 -0700 Subject: [PATCH 238/248] tcp: Fix data-races around sysctl_tcp_max_reordering. While reading sysctl_tcp_max_reordering, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: dca145ffaa8d ("tcp: allow for bigger reordering level") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 31a9d2b8ecdc..07dbcbae7782 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1051,7 +1051,7 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, tp->undo_marker ? tp->undo_retrans : 0); #endif tp->reordering = min_t(u32, (metric + mss - 1) / mss, - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); } /* This exciting event is worth to be remembered. 8) */ @@ -2030,7 +2030,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) return; tp->reordering = min_t(u32, tp->packets_out + addend, - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); tp->reord_seen++; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } From c0f47c2822aadeb8b2829f3e4c3792f184c7be33 Mon Sep 17 00:00:00 2001 From: Oz Shlomo Date: Tue, 19 Jul 2022 15:24:09 +0300 Subject: [PATCH 239/248] net/sched: cls_api: Fix flow action initialization The cited commit refactored the flow action initialization sequence to use an interface method when translating tc action instances to flow offload objects. The refactored version skips the initialization of the generic flow action attributes for tc actions, such as pedit, that allocate more than one offload entry. This can cause potential issues for drivers mapping flow action ids. Populate the generic flow action fields for all the flow action entries. Fixes: c54e1d920f04 ("flow_offload: add ops to tc_action_ops for flow action setup") Signed-off-by: Oz Shlomo Reviewed-by: Roi Dayan ---- v1 -> v2: - coalese the generic flow action fields initialization to a single loop Reviewed-by: Baowen Zheng Signed-off-by: David S. Miller --- net/sched/cls_api.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 9bb4d3dcc994..ac366c99086f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3533,7 +3533,7 @@ int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], struct netlink_ext_ack *extack) { - int i, j, index, err = 0; + int i, j, k, index, err = 0; struct tc_action *act; BUILD_BUG_ON(TCA_ACT_HW_STATS_ANY != FLOW_ACTION_HW_STATS_ANY); @@ -3553,14 +3553,18 @@ int tc_setup_action(struct flow_action *flow_action, if (err) goto err_out_locked; - entry->hw_stats = tc_act_hw_stats(act->hw_stats); - entry->hw_index = act->tcfa_index; index = 0; err = tc_setup_offload_act(act, entry, &index, extack); - if (!err) - j += index; - else + if (err) goto err_out_locked; + + for (k = 0; k < index ; k++) { + entry[k].hw_stats = tc_act_hw_stats(act->hw_stats); + entry[k].hw_index = act->tcfa_index; + } + + j += index; + spin_unlock_bh(&act->tcfa_lock); } From e5ec6a2513383fe2ecc2ee3b5f51d97acbbcd4d8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 19 Jul 2022 15:26:26 +0300 Subject: [PATCH 240/248] mlxsw: spectrum_router: Fix IPv4 nexthop gateway indication mlxsw needs to distinguish nexthops with a gateway from connected nexthops in order to write the former to the adjacency table of the device. The check used to rely on the fact that nexthops with a gateway have a 'link' scope whereas connected nexthops have a 'host' scope. This is no longer correct after commit 747c14307214 ("ip: fix dflt addr selection for connected nexthop"). Fix that by instead checking the address family of the gateway IP. This is a more direct way and also consistent with the IPv6 counterpart in mlxsw_sp_rt6_is_gateway(). Cc: stable@vger.kernel.org Fixes: 747c14307214 ("ip: fix dflt addr selection for connected nexthop") Fixes: 597cfe4fc339 ("nexthop: Add support for IPv4 nexthops") Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Reviewed-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 85aa1c468cd4..ce33dbde124d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5384,7 +5384,7 @@ static bool mlxsw_sp_fi_is_gateway(const struct mlxsw_sp *mlxsw_sp, { const struct fib_nh *nh = fib_info_nh(fi, 0); - return nh->fib_nh_scope == RT_SCOPE_LINK || + return nh->fib_nh_gw_family || mlxsw_sp_nexthop4_ipip_type(mlxsw_sp, nh, NULL); } From 543ce63b664e2c2f9533d089a4664b559c3e6b5b Mon Sep 17 00:00:00 2001 From: Eric Snowberg Date: Wed, 20 Jul 2022 12:40:27 -0400 Subject: [PATCH 241/248] lockdown: Fix kexec lockdown bypass with ima policy The lockdown LSM is primarily used in conjunction with UEFI Secure Boot. This LSM may also be used on machines without UEFI. It can also be enabled when UEFI Secure Boot is disabled. One of lockdown's features is to prevent kexec from loading untrusted kernels. Lockdown can be enabled through a bootparam or after the kernel has booted through securityfs. If IMA appraisal is used with the "ima_appraise=log" boot param, lockdown can be defeated with kexec on any machine when Secure Boot is disabled or unavailable. IMA prevents setting "ima_appraise=log" from the boot param when Secure Boot is enabled, but this does not cover cases where lockdown is used without Secure Boot. To defeat lockdown, boot without Secure Boot and add ima_appraise=log to the kernel command line; then: $ echo "integrity" > /sys/kernel/security/lockdown $ echo "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig" > \ /sys/kernel/security/ima/policy $ kexec -ls unsigned-kernel Add a call to verify ima appraisal is set to "enforce" whenever lockdown is enabled. This fixes CVE-2022-21505. Cc: stable@vger.kernel.org Fixes: 29d3c1c8dfe7 ("kexec: Allow kexec_file() with appropriate IMA policy when locked down") Signed-off-by: Eric Snowberg Acked-by: Mimi Zohar Reviewed-by: John Haxby Signed-off-by: Linus Torvalds --- security/integrity/ima/ima_policy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 73917413365b..a8802b8da946 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -2247,6 +2247,10 @@ bool ima_appraise_signature(enum kernel_read_file_id id) if (id >= READING_MAX_ID) return false; + if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE) + && security_locked_down(LOCKDOWN_KEXEC)) + return false; + func = read_idmap[id] ?: FILE_CHECK; rcu_read_lock(); From 353f7988dd8413c47718f7ca79c030b6fb62cfe5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 19 Jul 2022 11:09:01 -0700 Subject: [PATCH 242/248] watchqueue: make sure to serialize 'wqueue->defunct' properly When the pipe is closed, we mark the associated watchqueue defunct by calling watch_queue_clear(). However, while that is protected by the watchqueue lock, new watchqueue entries aren't actually added under that lock at all: they use the pipe->rd_wait.lock instead, and looking up that pipe happens without any locking. The watchqueue code uses the RCU read-side section to make sure that the wqueue entry itself hasn't disappeared, but that does not protect the pipe_info in any way. So make sure to actually hold the wqueue lock when posting watch events, properly serializing against the pipe being torn down. Reported-by: Noam Rathaus Cc: Greg KH Cc: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 53 +++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 230038d4f908..8b28fad1319b 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -34,6 +34,27 @@ MODULE_LICENSE("GPL"); #define WATCH_QUEUE_NOTE_SIZE 128 #define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) +/* + * This must be called under the RCU read-lock, which makes + * sure that the wqueue still exists. It can then take the lock, + * and check that the wqueue hasn't been destroyed, which in + * turn makes sure that the notification pipe still exists. + */ +static inline bool lock_wqueue(struct watch_queue *wqueue) +{ + spin_lock_bh(&wqueue->lock); + if (unlikely(wqueue->defunct)) { + spin_unlock_bh(&wqueue->lock); + return false; + } + return true; +} + +static inline void unlock_wqueue(struct watch_queue *wqueue) +{ + spin_unlock_bh(&wqueue->lock); +} + static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -69,6 +90,10 @@ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { /* * Post a notification to a watch queue. + * + * Must be called with the RCU lock for reading, and the + * watch_queue lock held, which guarantees that the pipe + * hasn't been released. */ static bool post_one_notification(struct watch_queue *wqueue, struct watch_notification *n) @@ -85,9 +110,6 @@ static bool post_one_notification(struct watch_queue *wqueue, spin_lock_irq(&pipe->rd_wait.lock); - if (wqueue->defunct) - goto out; - mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; @@ -203,7 +225,10 @@ void __post_watch_notification(struct watch_list *wlist, if (security_post_notification(watch->cred, cred, n) < 0) continue; - post_one_notification(wqueue, n); + if (lock_wqueue(wqueue)) { + post_one_notification(wqueue, n); + unlock_wqueue(wqueue);; + } } rcu_read_unlock(); @@ -462,11 +487,12 @@ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) return -EAGAIN; } - spin_lock_bh(&wqueue->lock); - kref_get(&wqueue->usage); - kref_get(&watch->usage); - hlist_add_head(&watch->queue_node, &wqueue->watches); - spin_unlock_bh(&wqueue->lock); + if (lock_wqueue(wqueue)) { + kref_get(&wqueue->usage); + kref_get(&watch->usage); + hlist_add_head(&watch->queue_node, &wqueue->watches); + unlock_wqueue(wqueue); + } hlist_add_head(&watch->list_node, &wlist->watchers); return 0; @@ -520,20 +546,15 @@ found: wqueue = rcu_dereference(watch->queue); - /* We don't need the watch list lock for the next bit as RCU is - * protecting *wqueue from deallocation. - */ - if (wqueue) { + if (lock_wqueue(wqueue)) { post_one_notification(wqueue, &n.watch); - spin_lock_bh(&wqueue->lock); - if (!hlist_unhashed(&watch->queue_node)) { hlist_del_init_rcu(&watch->queue_node); put_watch(watch); } - spin_unlock_bh(&wqueue->lock); + unlock_wqueue(wqueue); } if (wlist->release_watch) { From 44e29e64cf1ac0cffb152e0532227ea6d002aa28 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 21 Jul 2022 10:30:14 -0700 Subject: [PATCH 243/248] watch-queue: remove spurious double semicolon Sedat Dilek noticed that I had an extraneous semicolon at the end of a line in the previous patch. It's harmless, but unintentional, and while compilers just treat it as an extra empty statement, for all I know some other tooling might warn about it. So clean it up before other people notice too ;) Fixes: 353f7988dd84 ("watchqueue: make sure to serialize 'wqueue->defunct' properly") Reported-by: Sedat Dilek Signed-off-by: Linus Torvalds Reported-by: Sedat Dilek --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 8b28fad1319b..bb9962b33f95 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -227,7 +227,7 @@ void __post_watch_notification(struct watch_list *wlist, if (lock_wqueue(wqueue)) { post_one_notification(wqueue, n); - unlock_wqueue(wqueue);; + unlock_wqueue(wqueue); } } From 23a67619bc7e12e1b3776802f16084530b357a5d Mon Sep 17 00:00:00 2001 From: Khalid Masum Date: Thu, 21 Jul 2022 15:30:42 +0600 Subject: [PATCH 244/248] scripts/gdb: Fix gdb 'lx-symbols' command Currently the command 'lx-symbols' in gdb exits with the error`Function "do_init_module" not defined in "kernel/module.c"`. This occurs because the file kernel/module.c was moved to kernel/module/main.c. Fix this breakage by changing the path to "kernel/module/main.c" in LoadModuleBreakpoint. Signed-off-by: Khalid Masum Acked-by: Luis Chamberlain Fixes: cfc1d277891e ("module: Move all into module/") Reviewed-by: Douglas Anderson Signed-off-by: Linus Torvalds --- scripts/gdb/linux/symbols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 46f7542db08c..dc07b6d12e30 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -180,7 +180,7 @@ lx-symbols command.""" self.breakpoint.delete() self.breakpoint = None self.breakpoint = LoadModuleBreakpoint( - "kernel/module.c:do_init_module", self) + "kernel/module/main.c:do_init_module", self) else: gdb.write("Note: symbol update on module loading not supported " "with this gdb version\n") From 1e9fdf21a4339b102539f476a9842e7526c01939 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jul 2022 09:18:03 +0200 Subject: [PATCH 245/248] mmu_gather: Remove per arch tlb_{start,end}_vma() Scattered across the archs are 3 basic forms of tlb_{start,end}_vma(). Provide two new MMU_GATHER_knobs to enumerate them and remove the per arch tlb_{start,end}_vma() implementations. - MMU_GATHER_NO_FLUSH_CACHE indicates the arch has flush_cache_range() but does *NOT* want to call it for each VMA. - MMU_GATHER_MERGE_VMAS indicates the arch wants to merge the invalidate across multiple VMAs if possible. With these it is possible to capture the three forms: 1) empty stubs; select MMU_GATHER_NO_FLUSH_CACHE and MMU_GATHER_MERGE_VMAS 2) start: flush_cache_range(), end: empty; select MMU_GATHER_MERGE_VMAS 3) start: flush_cache_range(), end: flush_tlb_range(); default Obviously, if the architecture does not have flush_cache_range() then it also doesn't need to select MMU_GATHER_NO_FLUSH_CACHE. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: David Miller Signed-off-by: Linus Torvalds --- arch/Kconfig | 7 +++++++ arch/csky/include/asm/tlb.h | 13 ------------- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/tlb.h | 10 ---------- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/tlb.h | 2 -- arch/s390/Kconfig | 1 + arch/s390/include/asm/tlb.h | 3 --- arch/sparc/Kconfig | 2 ++ arch/sparc/include/asm/tlb_64.h | 2 -- arch/x86/Kconfig | 1 + arch/x86/include/asm/tlb.h | 3 --- include/asm-generic/tlb.h | 21 +++++++++++++++++++-- 13 files changed, 32 insertions(+), 35 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index fcf9a41a4ef5..71b9272acb28 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -438,6 +438,13 @@ config MMU_GATHER_PAGE_SIZE config MMU_GATHER_NO_RANGE bool + select MMU_GATHER_MERGE_VMAS + +config MMU_GATHER_NO_FLUSH_CACHE + bool + +config MMU_GATHER_MERGE_VMAS + bool config MMU_GATHER_NO_GATHER bool diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h index 3498e65f59f8..750d041938d8 100644 --- a/arch/csky/include/asm/tlb.h +++ b/arch/csky/include/asm/tlb.h @@ -4,19 +4,6 @@ #define __ASM_CSKY_TLB_H #include - -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \ - } while (0) - -#define tlb_end_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \ - } while (0) - #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) #include diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 53a912befb62..b57daee98b89 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -108,6 +108,7 @@ config LOONGARCH select TRACE_IRQFLAGS_SUPPORT select USE_PERCPU_NUMA_NODE_ID select ZONE_DMA32 + select MMU_GATHER_MERGE_VMAS if MMU config 32BIT bool diff --git a/arch/loongarch/include/asm/tlb.h b/arch/loongarch/include/asm/tlb.h index 4f629ae9d5a9..dd24f5898f65 100644 --- a/arch/loongarch/include/asm/tlb.h +++ b/arch/loongarch/include/asm/tlb.h @@ -137,16 +137,6 @@ static inline void invtlb_all(u32 op, u32 info, u64 addr) ); } -/* - * LoongArch doesn't need any special per-pte or per-vma handling, except - * we need to flush cache for area to be unmapped. - */ -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) static void tlb_flush(struct mmu_gather *tlb); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7aa12e88c580..c235648fae23 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -256,6 +256,7 @@ config PPC select IRQ_FORCED_THREADING select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE select NEED_PER_CPU_EMBED_FIRST_CHUNK if PPC64 diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 09a9ae5f3656..b3de6102a907 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -19,8 +19,6 @@ #include -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry #define tlb_flush tlb_flush diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8cd9e56c629b..5a1a8dfda6f8 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -204,6 +204,7 @@ config S390 select IOMMU_SUPPORT if PCI select MMU_GATHER_NO_GATHER select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PCI select NEED_SG_DMA_LENGTH if PCI diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index fe6407f0eb1b..3a5c8fb590e5 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -27,9 +27,6 @@ static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb #define pmd_free_tlb pmd_free_tlb diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index ba449c47effd..4f7d1dfbc608 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -67,6 +67,8 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select MMU_GATHER_RCU_TABLE_FREE if SMP + select MMU_GATHER_MERGE_VMAS + select MMU_GATHER_NO_FLUSH_CACHE select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h index 779a5a0f0608..3037187482db 100644 --- a/arch/sparc/include/asm/tlb_64.h +++ b/arch/sparc/include/asm/tlb_64.h @@ -22,8 +22,6 @@ void smp_flush_tlb_mm(struct mm_struct *mm); void __flush_tlb_pending(unsigned long, unsigned long, unsigned long *); void flush_tlb_pending(void); -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define tlb_flush(tlb) flush_tlb_pending() /* diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e58798f636d4..7fff10e15969 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -245,6 +245,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 1bfe979bb9bc..580636cdc257 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -2,9 +2,6 @@ #ifndef _ASM_X86_TLB_H #define _ASM_X86_TLB_H -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index ff3e82553a76..c1f03c1acbfc 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -158,9 +158,24 @@ * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * + * MMU_GATHER_NO_FLUSH_CACHE + * + * Indicates the architecture has flush_cache_range() but it needs *NOT* be called + * before unmapping a VMA. + * + * NOTE: strictly speaking we shouldn't have this knob and instead rely on + * flush_cache_range() being a NOP, except Sparc64 seems to be + * different here. + * + * MMU_GATHER_MERGE_VMAS + * + * Indicates the architecture wants to merge ranges over VMAs; typical when + * multiple range invalidates are more expensive than a full invalidate. + * * MMU_GATHER_NO_RANGE * - * Use this if your architecture lacks an efficient flush_tlb_range(). + * Use this if your architecture lacks an efficient flush_tlb_range(). This + * option implies MMU_GATHER_MERGE_VMAS above. * * MMU_GATHER_NO_GATHER * @@ -493,14 +508,16 @@ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct * return; tlb_update_vma_flags(tlb, vma); +#ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE flush_cache_range(vma, vma->vm_start, vma->vm_end); +#endif } #endif #ifndef tlb_end_vma static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { - if (tlb->fullmm) + if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) return; /* From 1d7708e75c49d08392884a08feeebaa3f9d80703 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jul 2022 09:18:04 +0200 Subject: [PATCH 246/248] csky/tlb: Remove tlb_flush() define The previous patch removed the tlb_flush_end() implementation which used tlb_flush_range(). This means: - csky did double invalidates, a range invalidate per vma and a full invalidate at the end - csky actually has range invalidates and as such the generic tlb_flush implementation is more efficient for it. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Tested-by: Guo Ren Signed-off-by: Linus Torvalds --- arch/csky/include/asm/tlb.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h index 750d041938d8..702861c68874 100644 --- a/arch/csky/include/asm/tlb.h +++ b/arch/csky/include/asm/tlb.h @@ -4,8 +4,6 @@ #define __ASM_CSKY_TLB_H #include -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include #endif /* __ASM_CSKY_TLB_H */ From 18ba064e42df3661e196ab58a23931fc732a420b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jul 2022 09:18:05 +0200 Subject: [PATCH 247/248] mmu_gather: Let there be one tlb_{start,end}_vma() implementation Now that architectures are no longer allowed to override tlb_{start,end}_vma() re-arrange code so that there is only one implementation for each of these functions. This much simplifies trying to figure out what they actually do. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index c1f03c1acbfc..897ca66338d5 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -349,8 +349,8 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb) #ifdef CONFIG_MMU_GATHER_NO_RANGE -#if defined(tlb_flush) || defined(tlb_start_vma) || defined(tlb_end_vma) -#error MMU_GATHER_NO_RANGE relies on default tlb_flush(), tlb_start_vma() and tlb_end_vma() +#if defined(tlb_flush) +#error MMU_GATHER_NO_RANGE relies on default tlb_flush() #endif /* @@ -370,17 +370,10 @@ static inline void tlb_flush(struct mmu_gather *tlb) static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { } -#define tlb_end_vma tlb_end_vma -static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { } - #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush -#if defined(tlb_start_vma) || defined(tlb_end_vma) -#error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma() -#endif - /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation @@ -501,7 +494,6 @@ static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb) * case where we're doing a full MM flush. When we're doing a munmap, * the vmas are adjusted to only cover the region to be torn down. */ -#ifndef tlb_start_vma static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) @@ -512,9 +504,7 @@ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct * flush_cache_range(vma, vma->vm_start, vma->vm_end); #endif } -#endif -#ifndef tlb_end_vma static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) @@ -528,7 +518,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm */ tlb_flush_mmu_tlbonly(tlb); } -#endif /* * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, From b67fbebd4cf980aecbcc750e1462128bffe8ae15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jul 2022 09:18:06 +0200 Subject: [PATCH 248/248] mmu_gather: Force tlb-flush VM_PFNMAP vmas Jann reported a race between munmap() and unmap_mapping_range(), where unmap_mapping_range() will no-op once unmap_vmas() has unlinked the VMA; however munmap() will not yet have invalidated the TLBs. Therefore unmap_mapping_range() will complete while there are still (stale) TLB entries for the specified range. Mitigate this by force flushing TLBs for VM_PFNMAP ranges. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 897ca66338d5..cb2167c89eee 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -303,6 +303,7 @@ struct mmu_gather { */ unsigned int vma_exec : 1; unsigned int vma_huge : 1; + unsigned int vma_pfn : 1; unsigned int batch_count; @@ -373,7 +374,6 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush - /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation @@ -393,6 +393,9 @@ static inline void tlb_flush(struct mmu_gather *tlb) flush_tlb_range(&vma, tlb->start, tlb->end); } } +#endif + +#endif /* CONFIG_MMU_GATHER_NO_RANGE */ static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) @@ -410,17 +413,9 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) */ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); + tlb->vma_pfn = !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); } -#else - -static inline void -tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { } - -#endif - -#endif /* CONFIG_MMU_GATHER_NO_RANGE */ - static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { /* @@ -507,16 +502,22 @@ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct * static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { - if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) + if (tlb->fullmm) return; /* - * Do a TLB flush and reset the range at VMA boundaries; this avoids - * the ranges growing with the unused space between consecutive VMAs, - * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on - * this. + * VM_PFNMAP is more fragile because the core mm will not track the + * page mapcount -- there might not be page-frames for these PFNs after + * all. Force flush TLBs for such ranges to avoid munmap() vs + * unmap_mapping_range() races. */ - tlb_flush_mmu_tlbonly(tlb); + if (tlb->vma_pfn || !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) { + /* + * Do a TLB flush and reset the range at VMA boundaries; this avoids + * the ranges growing with the unused space between consecutive VMAs. + */ + tlb_flush_mmu_tlbonly(tlb); + } } /*