// Solve the Laplace equation on a 2D lattice with boundary conditions.
// (using texture memory)
//
// compile with the following command:
//
// (for GTX970)
// nvcc -arch=compute_52 -code=sm_52,sm_52 -O3 -m64 -o laplace laplace.cu
//
// (for GTX1060)
// nvcc -arch=compute_61 -code=sm_61,sm_61 -O3 -m64 -o laplace laplace.cu
// Includes
#include
#include
#include
// field variables
float* h_new; // host field vectors
float* h_old;
float* h_C; // sum of diff * diff of each block
float* g_new; // device solution back to the host
float* d_new; // device field vectors
float* d_old;
float* d_C; // sum of diff*diff of each block
int MAX = 1000000; // maximum iterations
double eps = 1.0e-10; // stopping criterion
__align__(8) texture texOld; // declare the texture
__align__(8) texture texNew;
__global__ void laplacianTex(float* phi_old, float* phi_new, float* C, bool flag)
{
extern __shared__ float cache[];
float t, l, c, r, b; // top, left, center, right, bottom
float diff;
int site, ym1, xm1, xp1, yp1;
int Nx = blockDim.x * gridDim.x;
int Ny = blockDim.y * gridDim.y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int cacheIndex = threadIdx.x + threadIdx.y * blockDim.x;
site = x + y * Nx;
if((x == 0) || (x == Nx - 1) || (y == 0) || (y == Ny - 1)) {
}
else {
xm1 = site - 1; // x-1
xp1 = site + 1; // x+1
ym1 = site - Nx; // y-1
yp1 = site + Nx; // y+1
if(flag){
b = tex1Dfetch(texOld, ym1); // read d_old via texOld
l = tex1Dfetch(texOld, xm1);
c = tex1Dfetch(texOld, site);
r = tex1Dfetch(texOld, xp1);
t = tex1Dfetch(texOld, yp1);
phi_new[site] = 0.25 * (b + l + r + t);
diff = phi_new[site] - c;
}
else {
b = tex1Dfetch(texNew, ym1); // read d_new via texNew
l = tex1Dfetch(texNew, xm1);
c = tex1Dfetch(texNew, site);
r = tex1Dfetch(texNew, xp1);
t = tex1Dfetch(texNew, yp1);
phi_old[site] = 0.25 * (b + l + r + t);
diff = phi_old[site] - c;
}
}
// each thread saves its error estimate to the shared memory
cache[cacheIndex] = diff * diff;
__syncthreads();
// parallel reduction in each block
int ib = blockDim.x * blockDim.y / 2;
while (ib != 0) {
if(cacheIndex < ib)
cache[cacheIndex] += cache[cacheIndex + ib];
__syncthreads();
ib /= 2;
}
// save the partial sum of each block to C
int blockIndex = blockIdx.x + gridDim.x * blockIdx.y;
if(cacheIndex == 0)
C[blockIndex] = cache[0];
}
__global__ void laplacian(float* phi_old, float* phi_new, float* C, bool flag)
{
extern __shared__ float cache[];
float t, l, r, b; // top, left, right, bottom
float diff;
int site, ym1, xm1, xp1, yp1;
int Nx = blockDim.x * gridDim.x;
int Ny = blockDim.y * gridDim.y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int cacheIndex = threadIdx.x + threadIdx.y * blockDim.x;
site = x + y * Nx;
if((x == 0) || (x == Nx - 1) || (y == 0) || (y == Ny - 1)) {
}
else {
xm1 = site - 1; // x-1
xp1 = site + 1; // x+1
ym1 = site - Nx; // y-1
yp1 = site + Nx; // y+1
if(flag){
b = phi_old[ym1];
l = phi_old[xm1];
r = phi_old[xp1];
t = phi_old[yp1];
phi_new[site] = 0.25 * (b + l + r + t);
}
else {
b = phi_new[ym1];
l = phi_new[xm1];
r = phi_new[xp1];
t = phi_new[yp1];
phi_old[site] = 0.25 * (b + l + r + t);
}
diff = phi_new[site] - phi_old[site];
}
cache[cacheIndex] = diff * diff;
__syncthreads();
// perform parallel reduction
int ib = blockDim.x * blockDim.y / 2;
while (ib != 0) {
if(cacheIndex < ib)
cache[cacheIndex] += cache[cacheIndex + ib];
__syncthreads();
ib /= 2;
}
int blockIndex = blockIdx.x + gridDim.x * blockIdx.y;
if(cacheIndex == 0)
C[blockIndex] = cache[0];
}
int main(void)
{
double tex_error, error, cpu_error; // error estimate
int gid = 0;
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
err = cudaSetDevice(gid);
if (err != cudaSuccess) {
printf("!!! Cannot select GPU with device ID = %d\n", gid);
exit(1);
}
printf("Select GPU with device ID = %d\n", gid);
cudaSetDevice(gid);
printf("Solve Laplace equation on a 2D lattice with boundary conditions (using texture)\n");
int Nx, Ny; // lattice size
printf("Enter the size (Nx, Ny) of the 2D lattice: ");
scanf("%d %d", &Nx, &Ny);
printf("%d %d\n", Nx, Ny);
// Set the number of threads (tx, ty) per block
int tx, ty;
printf("Enter the number of threads (tx, ty) per block: ");
scanf("%d %d", &tx, &ty);
printf("%d %d\n", tx, ty);
if(tx * ty > 1024) {
printf("The number of threads per block must be less than 1024 ! \n");
exit(0);
}
dim3 threads(tx, ty);
// The total number of threads in the grid is equal to the total number of lattice sites
int bx = Nx / tx;
if(bx * tx != Nx) {
printf("The blocksize in x is incorrect\n");
exit(0);
}
int by = Ny / ty;
if(by * ty != Ny) {
printf("The blocksize in y is incorrect\n");
exit(0);
}
if((bx > 65535) || (by > 65535)) {
printf("The grid size exceeds the limit ! \n");
exit(0);
}
dim3 blocks(bx, by);
printf("The dimension of the grid is (%d, %d)\n", bx, by);
int CPU = 1;
printf("%d\n", CPU);
fflush(stdout);
// Allocate field vector h_phi in host memory
int N = Nx * Ny;
int size = N * sizeof(float);
int sb = bx * by * sizeof(float);
h_old = (float*)malloc(size);
h_new = (float*)malloc(size);
g_new = (float*)malloc(size);
h_C = (float*)malloc(sb);
memset(h_old, 0, size);
memset(h_new, 0, size);
// Initialize the field vector with boundary conditions
float top = 0.0, left = 0.0, right = 0.0, bottom = 1.0;
printf("Boundary condition for top, left, right, bottom: ");
scanf("%f %f %f %f", &top, &left, &right, &bottom);
for(int x = 0; x < Nx; x++){
h_new[x] = top;
h_old[x] = top;
h_new[x + Nx * (Ny - 1)] = bottom;
h_old[x + Nx * (Ny - 1)] = bottom;
}
for(int y = 0; y < Ny; y++){
h_new[y * Nx] = left;
h_old[y * Nx] = left;
h_new[y * Nx + Ny - 1] = right;
h_old[y * Nx + Ny - 1] = right;
}
FILE *out1; // save initial configuration in phi_initial.dat
char phi_init_name[80];
snprintf(phi_init_name, sizeof(phi_init_name), "phi_initial_%d_%d_%f_%f_%f_%f.dat", Nx, tx, top, left, right, bottom);
out1 = fopen(phi_init_name, "w");
fprintf(out1, "Inital field configuration:\n");
for(int j = Ny - 1;j > -1; j--) {
for(int i = 0; i < Nx; i++) {
fprintf(out1, "%.2e ", h_new[i + j * Nx]);
}
fprintf(out1, "\n");
}
fclose(out1);
// Texture
// create the timer
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// start the timer
cudaEventRecord(start,0);
// Allocate vectors in device memory
cudaMalloc((void**)&d_new, size);
cudaMalloc((void**)&d_old, size);
cudaMalloc((void**)&d_C, sb);
cudaBindTexture(NULL, texOld, d_old, size); // bind the texture
cudaBindTexture(NULL, texNew, d_new, size);
// Copy vectors from host memory to device memory
cudaMemcpy(d_new, h_new, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_old, h_old, size, cudaMemcpyHostToDevice);
// stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float tex_Intime;
cudaEventElapsedTime(&tex_Intime, start, stop);
printf("Tex Input time for GPU (Texture): %f (ms) \n", tex_Intime);
// start the timer
cudaEventRecord(start, 0);
tex_error = 10 * eps; // any value bigger than eps is OK
int tex_iter = 0, iter = 0, cpu_iter = 0; // counter for iterations
double diff;
volatile bool flag = true;
int sm = tx * ty * sizeof(float); // size of the shared memory in each block
while((tex_error > eps) && (tex_iter < MAX)){
laplacianTex<<>>(d_old, d_new, d_C, flag);
cudaMemcpy(h_C, d_C, sb, cudaMemcpyDeviceToHost);
tex_error = 0.0;
for(int i = 0; i< bx * by; i++) {
tex_error = tex_error + h_C[i];
}
tex_error = sqrt(tex_error);
tex_iter++;
flag = !flag;
}
printf("Tex error (GPU) = %.15e\n", tex_error);
printf("Tex total iterations (GPU) = %d\n", tex_iter);
// stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float tex_gputime;
double tex_flops;
cudaEventElapsedTime(&tex_gputime, start, stop);
printf("Processing time for GPU: %f (ms) \n", tex_gputime);
tex_flops = 7.0 * (Nx - 2) * (Ny - 2) * tex_iter;
printf("GPU Gflops: %f\n", tex_flops / (1000000.0 * tex_gputime));
// Copy result from device memory to host memory
// start the timer
cudaEventRecord(start, 0);
cudaMemcpy(g_new, d_new, size, cudaMemcpyDeviceToHost);
cudaFree(d_new);
cudaFree(d_old);
cudaFree(d_C);
cudaUnbindTexture(texOld);
cudaUnbindTexture(texNew);
// stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float tex_Outime;
cudaEventElapsedTime(&tex_Outime, start, stop);
printf("Output time for GPU (Texture): %f (ms) \n", tex_Outime);
float tex_gputime_tot;
tex_gputime_tot = tex_Intime + tex_gputime + tex_Outime;
printf("Total time for GPU (Texture): %f (ms) \n", tex_gputime_tot);
fflush(stdout);
printf("\n");
FILE *outg_tex; // save GPU solution in phi_GPU_Tex.dat
char phi_gpu_tex_name[80];
snprintf(phi_gpu_tex_name, sizeof(phi_gpu_tex_name), "phi_GPU_Tex_%d_%d_%f_%f_%f_%f.dat", Nx, tx, top, left, right, bottom);
outg_tex = fopen(phi_gpu_tex_name, "w");
fprintf(outg_tex, "GPU (using texture) field configuration:\n");
for(int j = Ny - 1;j > -1; j--){
for(int i = 0; i < Nx; i++){
fprintf(outg_tex, "%.2e ", g_new[i + j * Nx]);
}
fprintf(outg_tex, "\n");
}
fclose(outg_tex);
// start the timer
cudaEventRecord(start, 0);
if(CPU == 0) {
free(h_new);
free(h_old);
free(g_new);
cudaDeviceReset();
}
// Device
cudaEventCreate(&start);
cudaEventCreate(&stop);
// start the timer
cudaEventRecord(start,0);
// Allocate vectors in device memory
cudaMalloc((void**)&d_new, size);
cudaMalloc((void**)&d_old, size);
cudaMalloc((void**)&d_C, sb);
// Copy vectors from host memory to device memory
cudaMemcpy(d_new, h_new, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_old, h_old, size, cudaMemcpyHostToDevice);
// stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float Intime;
cudaEventElapsedTime(&Intime, start, stop);
printf("Input time for GPU: %f (ms) \n", Intime);
// start the timer
cudaEventRecord(start, 0);
error = 10 * eps; // any value bigger than eps is OK
iter = 0; // counter for iterations
flag = true;
sm = tx * ty * sizeof(float); // size of the shared memory in each block
while((error > eps) && (iter < MAX)){
laplacian<<>>(d_old, d_new, d_C, flag);
cudaMemcpy(h_C, d_C, sb, cudaMemcpyDeviceToHost);
error = 0.0;
for(int i = 0; i < bx * by; i++){
error = error + h_C[i];
}
error = sqrt(error);
iter++;
flag = !flag;
}
printf("error (GPU) = %.15e\n",error);
printf("total iterations (GPU) = %d\n",iter);
// stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float gputime;
double flops;
cudaEventElapsedTime(&gputime, start, stop);
printf("Processing time for GPU: %f (ms) \n", gputime);
flops = 7.0 * (Nx - 2) * (Ny - 2) * iter;
printf("GPU Gflops: %f\n", flops / (1000000.0 * gputime));
// Copy result from device memory to host memory
// start the timer
cudaEventRecord(start, 0);
cudaMemcpy(g_new, d_new, size, cudaMemcpyDeviceToHost);
cudaFree(d_new);
cudaFree(d_old);
cudaFree(d_C);
// stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float Outime;
cudaEventElapsedTime(&Outime, start, stop);
printf("Output time for GPU: %f (ms) \n", Outime);
float gputime_tot;
gputime_tot = Intime + gputime + Outime;
printf("Total time for GPU: %f (ms) \n", gputime_tot);
fflush(stdout);
FILE *outg; // save GPU solution in phi_GPU.dat
char phi_gpu_name[80];
snprintf(phi_gpu_name, sizeof(phi_gpu_name), "phi_GPU_%d_%d_%f_%f_%f_%f.dat", Nx, tx, top, left, right, bottom);
outg = fopen(phi_gpu_name,"w");
fprintf(outg, "GPU field configuration:\n");
for(int j = Ny - 1; j > -1; j--) {
for(int i = 0; i < Nx; i++) {
fprintf(outg, "%.2e ", g_new[i + j * Nx]);
}
fprintf(outg, "\n");
}
fclose(outg);
printf("\n");
// to compute the reference solution
cpu_error = 10 * eps; // any value bigger than eps
cpu_iter = 0; // counter for iterations
flag = true;
float t, l, r, b; // top, left, right, bottom
int site, ym1, xm1, xp1, yp1;
while ((cpu_error > eps) && (cpu_iter < MAX)){
if(flag) {
cpu_error = 0.0;
for(int y = 0; y < Ny; y++){
for(int x = 0; x < Nx; x++){
if(x == 0 || x == Nx - 1 || y == 0 || y == Ny - 1) {
}
else {
site = x + y * Nx;
xm1 = site - 1; // x - 1
xp1 = site + 1; // x + 1
ym1 = site - Nx; // y - 1
yp1 = site + Nx; // y + 1
b = h_old[ym1];
l = h_old[xm1];
r = h_old[xp1];
t = h_old[yp1];
h_new[site] = 0.25 * (b + l + r + t);
diff = h_new[site] - h_old[site];
cpu_error = cpu_error + diff * diff;
}
}
}
}
else{
cpu_error = 0.0;
for(int y=0; y < Ny; y++){
for(int x = 0; x < Nx; x++){
if(x == 0 || x == Nx - 1 || y == 0 || y == Ny - 1){
}
else{
site = x + y * Nx;
xm1 = site - 1; // x - 1
xp1 = site + 1; // x + 1
ym1 = site - Nx; // y - 1
yp1 = site + Nx; // y + 1
b = h_new[ym1];
l = h_new[xm1];
r = h_new[xp1];
t = h_new[yp1];
h_old[site] = 0.25 * (b + l + r + t);
diff = h_new[site] - h_old[site];
cpu_error = cpu_error + diff * diff;
}
}
}
}
flag = !flag;
cpu_iter++;
cpu_error = sqrt(cpu_error);
// printf("error = %.15e\n",error);
// printf("iteration = %d\n",iter);
} // exit if error < eps
printf("error (CPU) = %.15e\n", cpu_error);
printf("total iterations (CPU) = %d\n", cpu_iter);
// stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float cputime;
double cpu_flops;
cudaEventElapsedTime(&cputime, start, stop);
printf("Processing time for CPU: %f (ms) \n", cputime);
cpu_flops = 7.0 * (Nx - 2) * (Ny - 2) * cpu_iter;
printf("CPU Gflops: %lf\n", cpu_flops / (1000000.0 * cputime));
printf("Speed up of GPU (Texture) = %f\n", cputime / (tex_gputime_tot));
printf("Speed up of GPU = %f\n", cputime / (gputime_tot));
fflush(stdout);
// destroy the timer
cudaEventDestroy(start);
cudaEventDestroy(stop);
FILE *outc; // save CPU solution in phi_CPU.dat
char phi_cpu_name[80];
snprintf(phi_cpu_name, sizeof(phi_cpu_name), "phi_CPU_%d_%d_%f_%f_%f_%f.dat", Nx, tx, top, left, right, bottom);
outc = fopen(phi_cpu_name, "w");
fprintf(outc, "CPU field configuration:\n");
for(int j = Ny - 1; j > -1; j--) {
for(int i = 0; i < Nx; i++) {
fprintf(outc, "%.2e ", h_new[i + j * Nx]);
}
fprintf(outc,"\n");
}
fclose(outc);
printf("%d,%d,%f,%f,%d,%f,%f,%f,%f,%f,%f,%d,%f,%f,%f,%f,%f,%d,%f,%f,%f,%f",
Nx,
tx,
tex_Intime,
tex_error,
tex_iter,
tex_gputime,
tex_flops / (1000000.0 * tex_gputime),
tex_Outime,
tex_gputime_tot,
Intime,
error,
iter,
gputime,
flops / (1000000.0 * gputime),
Outime,
gputime_tot,
cpu_error,
cpu_iter,
cputime,
cpu_flops / (1000000.0 * cputime),
cputime / (tex_gputime_tot),
cputime / (gputime_tot));
free(h_new);
free(h_old);
free(g_new);
cudaDeviceReset();
}