powerpc/smp: Optimize update_mask_by_l2

All threads of a SMT4 core can either be part of this CPU's l2-cache mask or not related to this CPU l2-cache mask. Use this relation to reduce the number of iterations needed to find all the CPUs that share the same l2-cache. Use a temporary mask to iterate through the CPUs that may share l2_cache mask. Also instead of setting one CPU at a time into cpu_l2_cache_mask, copy the SMT4/sub mask at one shot. Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200921095653.9701-10-srikar@linux.vnet.ibm.com
2024-11-25 21:24:08 +08:00 · 2020-09-21 15:26:51 +05:30 · 2020-09-21 15:26:51 +05:30 · 3ab33d6dc3
commit 3ab33d6dc3
parent 375370a10d
1 changed files with 45 additions and 6 deletions
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@ -670,6 +670,28 @@ static void set_cpus_unrelated(int i, int j,
 }
 #endif

+/*
+ * Extends set_cpus_related. Instead of setting one CPU at a time in
+ * dstmask, set srcmask at oneshot. dstmask should be super set of srcmask.
+ */
+static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int),
+				struct cpumask *(*dstmask)(int))
+{
+	struct cpumask *mask;
+	int k;
+
+	mask = srcmask(j);
+	for_each_cpu(k, srcmask(i))
+		cpumask_or(dstmask(k), dstmask(k), mask);
+
+	if (i == j)
+		return;
+
+	mask = srcmask(i);
+	for_each_cpu(k, srcmask(j))
+		cpumask_or(dstmask(k), dstmask(k), mask);
+}
+
 /*
 * parse_thread_groups: Parses the "ibm,thread-groups" device tree
 *                      property for the CPU device node @dn and stores
@ -1220,7 +1242,9 @@ static struct device_node *cpu_to_l2cache(int cpu)

 static bool update_mask_by_l2(int cpu)
 {
+	struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
 	struct device_node *l2_cache, *np;
+	cpumask_var_t mask;
 	int i;

 	l2_cache = cpu_to_l2cache(cpu);
@ -1240,22 +1264,37 @@ static bool update_mask_by_l2(int cpu)
 		return false;
 	}

-	cpumask_set_cpu(cpu, cpu_l2_cache_mask(cpu));
-	for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
+	alloc_cpumask_var_node(&mask, GFP_KERNEL, cpu_to_node(cpu));
+	cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
+
+	if (has_big_cores)
+		submask_fn = cpu_smallcore_mask;
+
+	/* Update l2-cache mask with all the CPUs that are part of submask */
+	or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask);
+
+	/* Skip all CPUs already part of current CPU l2-cache mask */
+	cpumask_andnot(mask, mask, cpu_l2_cache_mask(cpu));
+
+	for_each_cpu(i, mask) {
 		/*
 		 * when updating the marks the current CPU has not been marked
 		 * online, but we need to update the cache masks
 		 */
 		np = cpu_to_l2cache(i);
-		if (!np)
-			continue;

-		if (np == l2_cache)
-			set_cpus_related(cpu, i, cpu_l2_cache_mask);
+		/* Skip all CPUs already part of current CPU l2-cache */
+		if (np == l2_cache) {
+			or_cpumasks_related(cpu, i, submask_fn, cpu_l2_cache_mask);
+			cpumask_andnot(mask, mask, submask_fn(i));
+		} else {
+			cpumask_andnot(mask, mask, cpu_l2_cache_mask(i));
+		}

 		of_node_put(np);
 	}
 	of_node_put(l2_cache);
+	free_cpumask_var(mask);

 	return true;
 }