mirror of
https://gcc.gnu.org/git/gcc.git
synced 2024-11-27 13:54:19 +08:00
Runtime checking of OpenACC parallelism dimensions clauses
libgomp/ * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite. * testsuite/lib/libgomp.exp (check_effective_target_openacc_nvidia_accel_configured): New proc. * testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c) (check_effective_target_c++): New procs. * testsuite/libgomp.oacc-c/c.exp (check_effective_target_c) (check_effective_target_c++): Likewise. From-SVN: r248358
This commit is contained in:
parent
bdf58f6bf4
commit
0c36d0d571
@ -1,3 +1,14 @@
|
||||
2017-05-23 Thomas Schwinge <thomas@codesourcery.com>
|
||||
|
||||
* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
|
||||
* testsuite/lib/libgomp.exp
|
||||
(check_effective_target_openacc_nvidia_accel_configured): New
|
||||
proc.
|
||||
* testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
|
||||
(check_effective_target_c++): New procs.
|
||||
* testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
|
||||
(check_effective_target_c++): Likewise.
|
||||
|
||||
2017-05-22 Jakub Jelinek <jakub@redhat.com>
|
||||
|
||||
PR middle-end/80809
|
||||
|
@ -359,6 +359,18 @@ proc check_effective_target_offload_device_shared_as { } {
|
||||
} ]
|
||||
}
|
||||
|
||||
# Return 1 if configured for nvptx offloading.

proc check_effective_target_openacc_nvidia_accel_configured { } {
    global offload_targets
    if { [string match "*,nvptx,*" ",$offload_targets,"] } {
	# PR libgomp/65099: Currently, we only support offloading in 64-bit
	# configurations.
	return [is-effective-target lp64]
    }
    return 0
}
|
||||
|
||||
# Return 1 if at least one nvidia board is present.
|
||||
|
||||
proc check_effective_target_openacc_nvidia_accel_present { } {
|
||||
|
@ -4,6 +4,13 @@ load_lib libgomp-dg.exp
|
||||
load_gcc_lib gcc-dg.exp
|
||||
load_gcc_lib torture-options.exp
|
||||
|
||||
# In the libgomp.oacc-c++ suite, test sources are always compiled as C++,
# never as C; report the effective targets accordingly.
proc check_effective_target_c { } {
    return 0
}
proc check_effective_target_c++ { } {
    return 1
}
|
||||
|
||||
global shlib_ext
|
||||
|
||||
set shlib_ext [get_shlib_extension]
|
||||
|
@ -1,25 +1,524 @@
|
||||
/* { dg-do run { target openacc_nvidia_accel_selected } } */
|
||||
/* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
|
||||
vector_length. */
|
||||
|
||||
#include <limits.h>
|
||||
#include <openacc.h>
|
||||
|
||||
/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
|
||||
not behaving as expected for -O0. */
|
||||
#pragma acc routine seq
|
||||
static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
|
||||
{
|
||||
if (acc_on_device ((int) acc_device_host))
|
||||
return 0;
|
||||
else if (acc_on_device ((int) acc_device_nvidia))
|
||||
{
|
||||
unsigned int r;
|
||||
asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
|
||||
return r;
|
||||
}
|
||||
else
|
||||
__builtin_abort ();
|
||||
}
|
||||
|
||||
#pragma acc routine seq
|
||||
static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
|
||||
{
|
||||
if (acc_on_device ((int) acc_device_host))
|
||||
return 0;
|
||||
else if (acc_on_device ((int) acc_device_nvidia))
|
||||
{
|
||||
unsigned int r;
|
||||
asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
|
||||
return r;
|
||||
}
|
||||
else
|
||||
__builtin_abort ();
|
||||
}
|
||||
|
||||
#pragma acc routine seq
|
||||
static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
|
||||
{
|
||||
if (acc_on_device ((int) acc_device_host))
|
||||
return 0;
|
||||
else if (acc_on_device ((int) acc_device_nvidia))
|
||||
{
|
||||
unsigned int r;
|
||||
asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));
|
||||
return r;
|
||||
}
|
||||
else
|
||||
__builtin_abort ();
|
||||
}
|
||||
|
||||
/* Worker and vector size checks. Picked an outrageously large
|
||||
value. */
|
||||
|
||||
int main ()
|
||||
{
|
||||
int dummy[10];
|
||||
acc_init (acc_device_default);
|
||||
|
||||
#pragma acc parallel num_workers (2<<20) /* { dg-error "using num_workers" } */
|
||||
/* Non-positive value. */
|
||||
|
||||
/* GR, WS, VS. */
|
||||
{
|
||||
#pragma acc loop worker
|
||||
for (int i = 0; i < 10; i++)
|
||||
dummy[i] = i;
|
||||
#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
|
||||
int gangs_actual = GANGS;
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc parallel copy (gangs_actual) \
|
||||
reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
|
||||
num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
|
||||
{
|
||||
/* We're actually executing with num_gangs (1). */
|
||||
gangs_actual = 1;
|
||||
for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
|
||||
{
|
||||
/* <https://gcc.gnu.org/PR80547>. */
|
||||
#if 0
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
#else
|
||||
int gangs = acc_gang ();
|
||||
gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
|
||||
gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
|
||||
int workers = acc_worker ();
|
||||
workers_min = (workers_min < workers) ? workers_min : workers;
|
||||
workers_max = (workers_max > workers) ? workers_max : workers;
|
||||
int vectors = acc_vector ();
|
||||
vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
|
||||
vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
if (gangs_actual != 1)
|
||||
__builtin_abort ();
|
||||
if (gangs_min != 0 || gangs_max != gangs_actual - 1
|
||||
|| workers_min != 0 || workers_max != 0
|
||||
|| vectors_min != 0 || vectors_max != 0)
|
||||
__builtin_abort ();
|
||||
#undef GANGS
|
||||
}
|
||||
|
||||
#pragma acc parallel vector_length (2<<20) /* { dg-error "using vector_length" } */
|
||||
/* GP, WS, VS. */
|
||||
{
|
||||
#pragma acc loop vector
|
||||
for (int i = 0; i < 10; i++)
|
||||
dummy[i] = i;
|
||||
#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
|
||||
int gangs_actual = GANGS;
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc parallel copy (gangs_actual) \
|
||||
num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
|
||||
{
|
||||
/* We're actually executing with num_gangs (1). */
|
||||
gangs_actual = 1;
|
||||
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
|
||||
{
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
}
|
||||
}
|
||||
if (gangs_actual != 1)
|
||||
__builtin_abort ();
|
||||
if (gangs_min != 0 || gangs_max != gangs_actual - 1
|
||||
|| workers_min != 0 || workers_max != 0
|
||||
|| vectors_min != 0 || vectors_max != 0)
|
||||
__builtin_abort ();
|
||||
#undef GANGS
|
||||
}
|
||||
|
||||
/* GR, WP, VS. */
|
||||
{
|
||||
#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
|
||||
int workers_actual = WORKERS;
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc parallel copy (workers_actual) \
|
||||
num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
|
||||
{
|
||||
/* We're actually executing with num_workers (1). */
|
||||
workers_actual = 1;
|
||||
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
|
||||
{
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
}
|
||||
}
|
||||
if (workers_actual != 1)
|
||||
__builtin_abort ();
|
||||
if (gangs_min != 0 || gangs_max != 0
|
||||
|| workers_min != 0 || workers_max != workers_actual - 1
|
||||
|| vectors_min != 0 || vectors_max != 0)
|
||||
__builtin_abort ();
|
||||
#undef WORKERS
|
||||
}
|
||||
|
||||
/* GR, WS, VP. */
|
||||
{
|
||||
#define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
|
||||
int vectors_actual = VECTORS;
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \
|
||||
vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
|
||||
{
|
||||
/* We're actually executing with vector_length (1), just the GCC nvptx
|
||||
back end enforces vector_length (32). */
|
||||
if (acc_on_device (acc_device_nvidia))
|
||||
vectors_actual = 32;
|
||||
else
|
||||
vectors_actual = 1;
|
||||
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
|
||||
{
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
}
|
||||
}
|
||||
if (acc_get_device_type () == acc_device_nvidia)
|
||||
{
|
||||
if (vectors_actual != 32)
|
||||
__builtin_abort ();
|
||||
}
|
||||
else
|
||||
if (vectors_actual != 1)
|
||||
__builtin_abort ();
|
||||
if (gangs_min != 0 || gangs_max != 0
|
||||
|| workers_min != 0 || workers_max != 0
|
||||
|| vectors_min != 0 || vectors_max != vectors_actual - 1)
|
||||
__builtin_abort ();
|
||||
#undef VECTORS
|
||||
}
|
||||
|
||||
|
||||
/* High value. */
|
||||
|
||||
/* GR, WS, VS. */
|
||||
{
|
||||
/* There is no actual limit for the number of gangs, so we try with a
|
||||
rather high value. */
|
||||
int gangs = 12345;
|
||||
int gangs_actual = gangs;
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc parallel copy (gangs_actual) \
|
||||
reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
|
||||
num_gangs (gangs)
|
||||
{
|
||||
if (acc_on_device (acc_device_host))
|
||||
{
|
||||
/* We're actually executing with num_gangs (1). */
|
||||
gangs_actual = 1;
|
||||
}
|
||||
/* As we're executing GR not GP, don't multiply with a "gangs_actual"
|
||||
factor. */
|
||||
for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
|
||||
{
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
}
|
||||
}
|
||||
if (gangs_actual < 1)
|
||||
__builtin_abort ();
|
||||
if (gangs_min != 0 || gangs_max != gangs_actual - 1
|
||||
|| workers_min != 0 || workers_max != 0
|
||||
|| vectors_min != 0 || vectors_max != 0)
|
||||
__builtin_abort ();
|
||||
}
|
||||
|
||||
/* GP, WS, VS. */
|
||||
{
|
||||
/* There is no actual limit for the number of gangs, so we try with a
|
||||
rather high value. */
|
||||
int gangs = 12345;
|
||||
int gangs_actual = gangs;
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc parallel copy (gangs_actual) \
|
||||
num_gangs (gangs)
|
||||
{
|
||||
if (acc_on_device (acc_device_host))
|
||||
{
|
||||
/* We're actually executing with num_gangs (1). */
|
||||
gangs_actual = 1;
|
||||
}
|
||||
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
|
||||
{
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
}
|
||||
}
|
||||
if (gangs_actual < 1)
|
||||
__builtin_abort ();
|
||||
if (gangs_min != 0 || gangs_max != gangs_actual - 1
|
||||
|| workers_min != 0 || workers_max != 0
|
||||
|| vectors_min != 0 || vectors_max != 0)
|
||||
__builtin_abort ();
|
||||
}
|
||||
|
||||
/* GR, WP, VS. */
|
||||
{
|
||||
/* We try with an outrageously large value. */
|
||||
#define WORKERS 2 << 20
|
||||
int workers_actual = WORKERS;
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
|
||||
num_workers (WORKERS)
|
||||
{
|
||||
if (acc_on_device (acc_device_host))
|
||||
{
|
||||
/* We're actually executing with num_workers (1). */
|
||||
workers_actual = 1;
|
||||
}
|
||||
else if (acc_on_device (acc_device_nvidia))
|
||||
{
|
||||
/* The GCC nvptx back end enforces num_workers (32). */
|
||||
workers_actual = 32;
|
||||
}
|
||||
else
|
||||
__builtin_abort ();
|
||||
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
|
||||
{
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
}
|
||||
}
|
||||
if (workers_actual < 1)
|
||||
__builtin_abort ();
|
||||
if (gangs_min != 0 || gangs_max != 0
|
||||
|| workers_min != 0 || workers_max != workers_actual - 1
|
||||
|| vectors_min != 0 || vectors_max != 0)
|
||||
__builtin_abort ();
|
||||
#undef WORKERS
|
||||
}
|
||||
|
||||
/* GR, WP, VS. */
|
||||
{
|
||||
/* We try with an outrageously large value. */
|
||||
int workers = 2 << 20;
|
||||
/* For nvptx offloading, this one will not result in "using num_workers
|
||||
(32), ignoring runtime setting", and will in fact try to launch with
|
||||
"num_workers (workers)", which will run into "libgomp: cuLaunchKernel
|
||||
error: invalid argument". So, limit ourselves here. */
|
||||
if (acc_get_device_type () == acc_device_nvidia)
|
||||
workers = 32;
|
||||
int workers_actual = workers;
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc parallel copy (workers_actual) \
|
||||
num_workers (workers)
|
||||
{
|
||||
if (acc_on_device (acc_device_host))
|
||||
{
|
||||
/* We're actually executing with num_workers (1). */
|
||||
workers_actual = 1;
|
||||
}
|
||||
else if (acc_on_device (acc_device_nvidia))
|
||||
{
|
||||
/* We're actually executing with num_workers (32). */
|
||||
/* workers_actual = 32; */
|
||||
}
|
||||
else
|
||||
__builtin_abort ();
|
||||
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
|
||||
{
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
}
|
||||
}
|
||||
if (workers_actual < 1)
|
||||
__builtin_abort ();
|
||||
if (gangs_min != 0 || gangs_max != 0
|
||||
|| workers_min != 0 || workers_max != workers_actual - 1
|
||||
|| vectors_min != 0 || vectors_max != 0)
|
||||
__builtin_abort ();
|
||||
}
|
||||
|
||||
/* GR, WS, VP. */
|
||||
{
|
||||
/* We try with an outrageously large value. */
|
||||
#define VECTORS 2 << 20
|
||||
int vectors_actual = VECTORS;
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
|
||||
vector_length (VECTORS)
|
||||
{
|
||||
if (acc_on_device (acc_device_host))
|
||||
{
|
||||
/* We're actually executing with vector_length (1). */
|
||||
vectors_actual = 1;
|
||||
}
|
||||
else if (acc_on_device (acc_device_nvidia))
|
||||
{
|
||||
/* The GCC nvptx back end enforces vector_length (32). */
|
||||
vectors_actual = 32;
|
||||
}
|
||||
else
|
||||
__builtin_abort ();
|
||||
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
|
||||
{
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
}
|
||||
}
|
||||
if (vectors_actual < 1)
|
||||
__builtin_abort ();
|
||||
if (gangs_min != 0 || gangs_max != 0
|
||||
|| workers_min != 0 || workers_max != 0
|
||||
|| vectors_min != 0 || vectors_max != vectors_actual - 1)
|
||||
__builtin_abort ();
|
||||
#undef VECTORS
|
||||
}
|
||||
|
||||
/* GR, WS, VP. */
|
||||
{
|
||||
/* We try with an outrageously large value. */
|
||||
int vectors = 2 << 20;
|
||||
int vectors_actual = vectors;
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \
|
||||
vector_length (vectors)
|
||||
{
|
||||
if (acc_on_device (acc_device_host))
|
||||
{
|
||||
/* We're actually executing with vector_length (1). */
|
||||
vectors_actual = 1;
|
||||
}
|
||||
else if (acc_on_device (acc_device_nvidia))
|
||||
{
|
||||
/* The GCC nvptx back end enforces vector_length (32). */
|
||||
vectors_actual = 32;
|
||||
}
|
||||
else
|
||||
__builtin_abort ();
|
||||
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
|
||||
{
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
}
|
||||
}
|
||||
if (vectors_actual < 1)
|
||||
__builtin_abort ();
|
||||
if (gangs_min != 0 || gangs_max != 0
|
||||
|| workers_min != 0 || workers_max != 0
|
||||
|| vectors_min != 0 || vectors_max != vectors_actual - 1)
|
||||
__builtin_abort ();
|
||||
}
|
||||
|
||||
|
||||
/* Composition of GP, WP, VP. */
|
||||
{
|
||||
int gangs = 12345;
|
||||
/* With nvptx offloading, multi-level reductions apparently are very slow
|
||||
in the following case. So, limit ourselves here. */
|
||||
if (acc_get_device_type () == acc_device_nvidia)
|
||||
gangs = 3;
|
||||
int gangs_actual = gangs;
|
||||
#define WORKERS 3
|
||||
int workers_actual = WORKERS;
|
||||
#define VECTORS 11
|
||||
int vectors_actual = VECTORS;
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \
|
||||
num_gangs (gangs) \
|
||||
num_workers (WORKERS) \
|
||||
vector_length (VECTORS)
|
||||
{
|
||||
if (acc_on_device (acc_device_host))
|
||||
{
|
||||
/* We're actually executing with num_gangs (1), num_workers (1),
|
||||
vector_length (1). */
|
||||
gangs_actual = 1;
|
||||
workers_actual = 1;
|
||||
vectors_actual = 1;
|
||||
}
|
||||
else if (acc_on_device (acc_device_nvidia))
|
||||
{
|
||||
/* The GCC nvptx back end enforces vector_length (32). */
|
||||
vectors_actual = 32;
|
||||
}
|
||||
else
|
||||
__builtin_abort ();
|
||||
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
|
||||
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
|
||||
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
|
||||
{
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
}
|
||||
}
|
||||
if (gangs_min != 0 || gangs_max != gangs_actual - 1
|
||||
|| workers_min != 0 || workers_max != workers_actual - 1
|
||||
|| vectors_min != 0 || vectors_max != vectors_actual - 1)
|
||||
__builtin_abort ();
|
||||
#undef VECTORS
|
||||
#undef WORKERS
|
||||
}
|
||||
|
||||
|
||||
/* We can't test parallelized OpenACC kernels constructs in this way: use of
|
||||
the acc_gang, acc_worker, acc_vector functions will make the construct
|
||||
unparallelizable. */
|
||||
|
||||
|
||||
/* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
|
||||
kernels. */
|
||||
{
|
||||
int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
|
||||
gangs_min = workers_min = vectors_min = INT_MAX;
|
||||
gangs_max = workers_max = vectors_max = INT_MIN;
|
||||
#pragma acc kernels
|
||||
{
|
||||
/* This is to make the OpenACC kernels construct unparallelizable. */
|
||||
asm volatile ("" : : : "memory");
|
||||
|
||||
#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
|
||||
for (int i = 100; i > -100; --i)
|
||||
{
|
||||
gangs_min = gangs_max = acc_gang ();
|
||||
workers_min = workers_max = acc_worker ();
|
||||
vectors_min = vectors_max = acc_vector ();
|
||||
}
|
||||
}
|
||||
if (gangs_min != 0 || gangs_max != 1 - 1
|
||||
|| workers_min != 0 || workers_max != 1 - 1
|
||||
|| vectors_min != 0 || vectors_max != 1 - 1)
|
||||
__builtin_abort ();
|
||||
}
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -15,6 +15,13 @@ load_lib libgomp-dg.exp
|
||||
load_gcc_lib gcc-dg.exp
|
||||
load_gcc_lib torture-options.exp
|
||||
|
||||
# In the libgomp.oacc-c suite, test sources are always compiled as C,
# never as C++; report the effective targets accordingly.
proc check_effective_target_c { } {
    return 1
}
proc check_effective_target_c++ { } {
    return 0
}
|
||||
|
||||
# Initialize dg.
|
||||
dg-init
|
||||
torture-init
|
||||
|
Loading…
Reference in New Issue
Block a user