// Includes
#include
#include
// Defines
#define RAND_MIN -1.0
// Variables
float* h_A; // host vectors
float* h_B;
float* d_A; // device vectors
float* d_B;
// Functions
void RandomInit(float*, int);
// Device code
__global__ void VecMax(const float* A, float* B, int N)
{
extern __shared__ float cache[];
int i = blockDim.x * blockIdx.x + threadIdx.x;
int cacheIndex = threadIdx.x;
float temp = RAND_MIN; // register for each thread
while (i < N) {
if(A[i] > temp)
temp = A[i];
i += blockDim.x * gridDim.x;
}
cache[cacheIndex] = temp; // set the cache value
__syncthreads();
// perform parallel reduction, threadsPerBlock must be 2^m
int ib = blockDim.x / 2;
while (ib != 0) {
if(cacheIndex < ib && cache[cacheIndex + ib] > cache[cacheIndex])
cache[cacheIndex] = cache[cacheIndex + ib];
__syncthreads();
ib /= 2;
}
if(cacheIndex == 0)
B[blockIdx.x] = cache[0];
}
// Host code
int main(void)
{
int gid = 0;
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
err = cudaSetDevice(gid);
if (err != cudaSuccess) {
printf("!!! Cannot select GPU with device ID = %d\n", gid);
exit(1);
}
printf("Set GPU with device ID = %d\n", gid);
cudaSetDevice(gid);
printf("Maximum of an array: A\n");
int N;
printf("Enter the size of the vectors: ");
scanf("%d", &N);
printf("%d\n", N);
// Set the sizes of threads and blocks
int threadsPerBlock;
printf("Enter the number (2^m) of threads per block: ");
scanf("%d", &threadsPerBlock);
printf("%d\n", threadsPerBlock);
if(threadsPerBlock > 1024) {
printf("The number of threads per block must be less than 1024 ! \n");
exit(0);
}
int blocksPerGrid;
printf("Enter the number of blocks per grid: ");
scanf("%d", &blocksPerGrid);
printf("%d\n", blocksPerGrid);
if(blocksPerGrid > 2147483647) {
printf("The number of blocks must be less than 2147483647 ! \n");
exit(0);
}
// Allocate input vectors h_A in host memory
int size = N * sizeof(float);
int sb = blocksPerGrid * sizeof(float);
h_A = (float*)malloc(size);
h_B = (float*)malloc(sb); // contains the result of the maximum from each block
// Initialize input vectors
RandomInit(h_A, N);
// create the timer
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// start the timer
cudaEventRecord(start, 0);
// Allocate vectors in device memory
cudaMalloc((void**)&d_A, size);
cudaMalloc((void**)&d_B, sb);
// Copy vectors from host memory to device memory
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
// stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float Intime;
cudaEventElapsedTime(&Intime, start, stop);
printf("Input time for GPU: %f (ms) \n", Intime);
// start the timer
cudaEventRecord(start, 0);
int sm = threadsPerBlock * sizeof(float);
VecMax<<>>(d_A, d_B, N);
// stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float gputime;
cudaEventElapsedTime(&gputime, start, stop);
printf("Processing time for GPU: %f (ms) \n", gputime);
printf("GPU Gflops: %f\n", N / (1000000.0 * gputime));
// Copy result from device memory to host memory
// h_B contains the result of each block in host memory
// start the timer
cudaEventRecord(start, 0);
cudaMemcpy(h_B, d_B, sb, cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
float h_G = 0.0;
for(int i = 0; i < blocksPerGrid; i++)
if(h_B[i] > h_G)
h_G = h_B[i];
// stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float Outime;
cudaEventElapsedTime(&Outime, start, stop);
printf("Output time for GPU: %f (ms) \n", Outime);
float gputime_tot;
gputime_tot = Intime + gputime + Outime;
printf("Total time for GPU: %f (ms) \n", gputime_tot);
// start the timer
cudaEventRecord(start, 0);
// to compute the reference solution
float h_C = RAND_MIN;
for(int i = 0; i < N; i++)
if(h_A[i] > h_C)
h_C = h_A[i];
// stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float cputime;
cudaEventElapsedTime(&cputime, start, stop);
printf("Processing time for CPU: %f (ms) \n", cputime);
printf("CPU Gflops: %f\n", N / (1000000.0 * cputime));
printf("Speed up of GPU = %f\n", cputime / (gputime_tot));
// destroy the timer
cudaEventDestroy(start);
cudaEventDestroy(stop);
// check result
printf("Check result:\n");
float diff = abs((h_C - h_G) / h_C);
printf("|(h_G - h_C)/h_C|=%20.15e\n", diff);
printf("h_G = %20.15e\n", h_G);
printf("h_C = %20.15e\n", h_C);
printf("%d,%d,%d,%f,%f,%f,%f,%f,%f,%f,%f,%f",
N,
threadsPerBlock,
blocksPerGrid,
Intime,
gputime,
N / (1000000.0 * gputime),
Outime,
gputime_tot,
cputime,
N / (1000000.0 * cputime),
cputime / (gputime_tot),
diff
);
cudaDeviceReset();
}
// Allocates an array with random float entries in (-1, 1)
void RandomInit(float* data, int n)
{
for (int i = 0; i < n; ++i)
data[i] = rand() / (float)RAND_MAX * 2.0 + RAND_MIN;
}