// Solve the Laplace equation on a 2D lattice with boundary conditions.
// (using texture memory)
//
// compile with the following command:
//
// (for GTX970)
// nvcc -arch=compute_52 -code=sm_52,sm_52 -O3 -m64 -o laplace laplace.cu
//
// (for GTX1060)
// nvcc -arch=compute_61 -code=sm_61,sm_61 -O3 -m64 -o laplace laplace.cu


// Includes
#include 
#include 
#include 

// field variables
float* h_new;	 // host field vectors
float* h_old;	 
float* h_C;	 // sum of diff * diff of each block
float* g_new;	 // device solution back to the host 
float* d_new;	 // device field vectors
float* d_old;	
float* d_C;	 // sum of diff*diff of each block 

int	 MAX = 1000000;		// maximum iterations
double	eps = 1.0e-10;		// stopping criterion


__align__(8) texture	texOld;	 // declare the texture
__align__(8) texture	texNew;


__global__ void laplacianTex(float* phi_old, float* phi_new, float* C, bool flag)
{
	extern __shared__ float cache[];	 
	float	t, l, c, r, b;	 // top, left, center, right, bottom
	float	diff; 
	int	site, ym1, xm1, xp1, yp1;

	int Nx = blockDim.x * gridDim.x;
	int Ny = blockDim.y * gridDim.y;
	int x = blockDim.x * blockIdx.x + threadIdx.x;
	int y = blockDim.y * blockIdx.y + threadIdx.y;
	int cacheIndex = threadIdx.x + threadIdx.y * blockDim.x;	

	site = x + y * Nx;

	if((x == 0) || (x == Nx - 1) || (y == 0) || (y == Ny - 1)) {	
	}
	else {
		xm1 = site - 1;	// x-1
		xp1 = site + 1;	// x+1
		ym1 = site - Nx;	 // y-1
		yp1 = site + Nx;	 // y+1
		if(flag){
			b = tex1Dfetch(texOld, ym1);		// read d_old via texOld
			l = tex1Dfetch(texOld, xm1);
			c = tex1Dfetch(texOld, site);
			r = tex1Dfetch(texOld, xp1);
			t = tex1Dfetch(texOld, yp1);

			phi_new[site] = 0.25 * (b + l + r + t);
			diff = phi_new[site] - c;
		}
		else {
			b = tex1Dfetch(texNew, ym1);	 // read d_new via texNew
			l = tex1Dfetch(texNew, xm1);
			c = tex1Dfetch(texNew, site);
			r = tex1Dfetch(texNew, xp1);
			t = tex1Dfetch(texNew, yp1);

			phi_old[site] = 0.25 * (b + l + r + t);
			diff = phi_old[site] - c;
		}
	}

	// each thread saves its error estimate to the shared memory

	cache[cacheIndex] = diff * diff;	
	__syncthreads();

	// parallel reduction in each block 

	int ib = blockDim.x * blockDim.y / 2;	
	while (ib != 0) {	
		if(cacheIndex < ib)	
			cache[cacheIndex] += cache[cacheIndex + ib];
		__syncthreads();
		ib /= 2;	
	}

	// save the partial sum of each block to C

	int blockIndex = blockIdx.x + gridDim.x * blockIdx.y;
	if(cacheIndex == 0)
		C[blockIndex] = cache[0];	
}

__global__ void laplacian(float* phi_old, float* phi_new, float* C, bool flag)
{
	extern __shared__ float cache[];
	float t, l, r, b;	 // top, left, right, bottom
	float diff; 
	int	site, ym1, xm1, xp1, yp1;

	int Nx = blockDim.x * gridDim.x;
	int Ny = blockDim.y * gridDim.y;
	int x = blockDim.x * blockIdx.x + threadIdx.x;
	int y = blockDim.y * blockIdx.y + threadIdx.y;
	int cacheIndex = threadIdx.x + threadIdx.y * blockDim.x;	

	site = x + y * Nx;

	if((x == 0) || (x == Nx - 1) || (y == 0) || (y == Ny - 1)) {	
	}
	else {
		xm1 = site - 1;	// x-1
		xp1 = site + 1;	// x+1
		ym1 = site - Nx;	 // y-1
		yp1 = site + Nx;	 // y+1
		if(flag){
			b = phi_old[ym1]; 
			l = phi_old[xm1]; 
			r = phi_old[xp1]; 
			t = phi_old[yp1]; 
			phi_new[site] = 0.25 * (b + l + r + t);
		}
		else {
			b = phi_new[ym1]; 
			l = phi_new[xm1]; 
			r = phi_new[xp1]; 
			t = phi_new[yp1]; 
			phi_old[site] = 0.25 * (b + l + r + t);
		}
		diff = phi_new[site] - phi_old[site];
	}
	cache[cacheIndex] = diff * diff;
	__syncthreads();

	// perform parallel reduction

	int ib = blockDim.x * blockDim.y / 2;	
	while (ib != 0) {	
		if(cacheIndex < ib)	
		cache[cacheIndex] += cache[cacheIndex + ib];
		__syncthreads();
		ib /= 2;	
	} 
	int blockIndex = blockIdx.x + gridDim.x * blockIdx.y;
	if(cacheIndex == 0)
		C[blockIndex] = cache[0];
}


int main(void)
{

	double tex_error, error, cpu_error;			// error estimate 

	int gid = 0;

	// Error code to check return values for CUDA calls
	cudaError_t err = cudaSuccess;

	err = cudaSetDevice(gid);
	if (err != cudaSuccess) {
		printf("!!! Cannot select GPU with device ID = %d\n", gid);
		exit(1);
	}
	printf("Select GPU with device ID = %d\n", gid);

	cudaSetDevice(gid);

	printf("Solve Laplace equation on a 2D lattice with boundary conditions (using texture)\n");

	int Nx, Ny;				// lattice size
	printf("Enter the size (Nx, Ny) of the 2D lattice: ");
	scanf("%d %d", &Nx, &Ny);		
	printf("%d %d\n", Nx, Ny);		

	// Set the number of threads (tx, ty) per block
	 
	int tx, ty;
	printf("Enter the number of threads (tx, ty) per block: ");
	scanf("%d %d", &tx, &ty);
	printf("%d %d\n", tx, ty);
	if(tx * ty > 1024) {
		printf("The number of threads per block must be less than 1024 ! \n");
		exit(0);
	}
	dim3 threads(tx, ty); 
	
	// The total number of threads in the grid is equal to the total number of lattice sites
	
	int bx = Nx / tx;
	if(bx * tx != Nx) {
		printf("The blocksize in x is incorrect\n"); 
		exit(0);
	}
	int by = Ny / ty;
	if(by * ty != Ny) {
		printf("The blocksize in y is incorrect\n"); 
		exit(0);
	}
	if((bx > 65535) || (by > 65535)) {
		printf("The grid size exceeds the limit ! \n");
		exit(0);
	}
	dim3 blocks(bx, by);
	printf("The dimension of the grid is (%d, %d)\n", bx, by); 

	int CPU = 1;
	printf("%d\n", CPU);
	fflush(stdout);

	// Allocate field vector h_phi in host memory

	int N = Nx * Ny;
	int size = N * sizeof(float);
	int sb = bx * by * sizeof(float);
	h_old = (float*)malloc(size);
	h_new = (float*)malloc(size);
	g_new = (float*)malloc(size);
	h_C = (float*)malloc(sb);
	 
	memset(h_old, 0, size);	
	memset(h_new, 0, size);

	// Initialize the field vector with boundary conditions

	float top = 0.0, left = 0.0, right = 0.0, bottom = 1.0;
	printf("Boundary condition for top, left, right, bottom: ");
	scanf("%f %f %f %f", &top, &left, &right, &bottom);
	for(int x = 0; x < Nx; x++){
		h_new[x] = top;
		h_old[x] = top;

		h_new[x + Nx * (Ny - 1)] = bottom;	
		h_old[x + Nx * (Ny - 1)] = bottom;
	}
	for(int y = 0; y < Ny; y++){
		h_new[y * Nx] = left;
		h_old[y * Nx] = left;

		h_new[y * Nx + Ny - 1] = right;
		h_old[y * Nx + Ny - 1] = right;
	}

	FILE *out1;				// save initial configuration in phi_initial.dat
	char phi_init_name[80];
	snprintf(phi_init_name, sizeof(phi_init_name), "phi_initial_%d_%d_%f_%f_%f_%f.dat", Nx, tx, top, left, right, bottom);
	out1 = fopen(phi_init_name, "w");

	fprintf(out1, "Inital field configuration:\n");
	for(int j = Ny - 1;j > -1; j--) {
		for(int i = 0; i < Nx; i++) {
			fprintf(out1, "%.2e ", h_new[i + j * Nx]);
		}
		fprintf(out1, "\n");
	}
	fclose(out1);

	// Texture
	// create the timer
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	// start the timer
	cudaEventRecord(start,0);

	// Allocate vectors in device memory

	cudaMalloc((void**)&d_new, size);
	cudaMalloc((void**)&d_old, size);
	cudaMalloc((void**)&d_C, sb);

	cudaBindTexture(NULL, texOld, d_old, size);	 // bind the texture
	cudaBindTexture(NULL, texNew, d_new, size);

	// Copy vectors from host memory to device memory

	cudaMemcpy(d_new, h_new, size, cudaMemcpyHostToDevice);
	cudaMemcpy(d_old, h_old, size, cudaMemcpyHostToDevice);
	
	// stop the timer
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	float tex_Intime;
	cudaEventElapsedTime(&tex_Intime, start, stop);
	printf("Tex Input time for GPU (Texture): %f (ms) \n", tex_Intime);

	// start the timer
	cudaEventRecord(start, 0);

	tex_error = 10 * eps;		// any value bigger than eps is OK
	int tex_iter = 0, iter = 0, cpu_iter = 0;		// counter for iterations
	double diff; 

	volatile bool flag = true;	 

	int sm = tx * ty * sizeof(float);	 // size of the shared memory in each block

	while((tex_error > eps) && (tex_iter < MAX)){

		laplacianTex<<>>(d_old, d_new, d_C, flag);
		cudaMemcpy(h_C, d_C, sb, cudaMemcpyDeviceToHost);
		tex_error = 0.0;
		for(int i = 0; i< bx * by; i++) {
			tex_error = tex_error + h_C[i];
		}
		tex_error = sqrt(tex_error);

		tex_iter++;
		flag = !flag;

	}
	 
	printf("Tex error (GPU) = %.15e\n", tex_error);
	printf("Tex total iterations (GPU) = %d\n", tex_iter);
	
	// stop the timer
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	float tex_gputime;
	double tex_flops;
	cudaEventElapsedTime(&tex_gputime, start, stop);
	printf("Processing time for GPU: %f (ms) \n", tex_gputime);
	tex_flops = 7.0 * (Nx - 2) * (Ny - 2) * tex_iter;
	printf("GPU Gflops: %f\n", tex_flops / (1000000.0 * tex_gputime));
	
	// Copy result from device memory to host memory

	// start the timer
	cudaEventRecord(start, 0);

	cudaMemcpy(g_new, d_new, size, cudaMemcpyDeviceToHost);

	cudaFree(d_new);
	cudaFree(d_old);
	cudaFree(d_C);

	cudaUnbindTexture(texOld);
	cudaUnbindTexture(texNew);

	// stop the timer
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	float tex_Outime;
	cudaEventElapsedTime(&tex_Outime, start, stop);
	printf("Output time for GPU (Texture): %f (ms) \n", tex_Outime);

	float tex_gputime_tot;
	tex_gputime_tot = tex_Intime + tex_gputime + tex_Outime;
	printf("Total time for GPU (Texture): %f (ms) \n", tex_gputime_tot);
	fflush(stdout);

	printf("\n");

	FILE *outg_tex;	// save GPU solution in phi_GPU_Tex.dat

	char phi_gpu_tex_name[80];
	snprintf(phi_gpu_tex_name, sizeof(phi_gpu_tex_name), "phi_GPU_Tex_%d_%d_%f_%f_%f_%f.dat", Nx, tx, top, left, right, bottom);
	outg_tex = fopen(phi_gpu_tex_name, "w");

	fprintf(outg_tex, "GPU (using texture) field configuration:\n");
	for(int j = Ny - 1;j > -1; j--){
		for(int i = 0; i < Nx; i++){
			fprintf(outg_tex, "%.2e ", g_new[i + j * Nx]);
		}
		fprintf(outg_tex, "\n");
	}
	fclose(outg_tex);

	// start the timer
	cudaEventRecord(start, 0);

	if(CPU == 0) {
		free(h_new);
		free(h_old);
		free(g_new);
		cudaDeviceReset();
	}

	// Device
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	// start the timer
	cudaEventRecord(start,0);

	// Allocate vectors in device memory

	cudaMalloc((void**)&d_new, size);
	cudaMalloc((void**)&d_old, size);
	cudaMalloc((void**)&d_C, sb);

	// Copy vectors from host memory to device memory
	cudaMemcpy(d_new, h_new, size, cudaMemcpyHostToDevice);
	cudaMemcpy(d_old, h_old, size, cudaMemcpyHostToDevice);
	
	// stop the timer
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	float Intime;
	cudaEventElapsedTime(&Intime, start, stop);
	printf("Input time for GPU: %f (ms) \n", Intime);

	// start the timer
	cudaEventRecord(start, 0);

	error = 10 * eps;  // any value bigger than eps is OK
	iter = 0;		// counter for iterations
	flag = true; 
 
	sm = tx * ty * sizeof(float);   // size of the shared memory in each block

	while((error > eps) && (iter < MAX)){

		laplacian<<>>(d_old, d_new, d_C, flag);
		cudaMemcpy(h_C, d_C, sb, cudaMemcpyDeviceToHost);
		error = 0.0;
		for(int i = 0; i < bx * by; i++){
			error = error + h_C[i];
		}
		error = sqrt(error);

		iter++;
		flag = !flag;

	}

	printf("error (GPU) = %.15e\n",error);
	printf("total iterations (GPU) = %d\n",iter);
	
	// stop the timer
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	float gputime;
	double flops;
	cudaEventElapsedTime(&gputime, start, stop);
	printf("Processing time for GPU: %f (ms) \n", gputime);
	flops = 7.0 * (Nx - 2) * (Ny - 2) * iter;
	printf("GPU Gflops: %f\n", flops / (1000000.0 * gputime));
	
	// Copy result from device memory to host memory

	// start the timer
	cudaEventRecord(start, 0);

	cudaMemcpy(g_new, d_new, size, cudaMemcpyDeviceToHost);

	cudaFree(d_new);
	cudaFree(d_old);
	cudaFree(d_C);

	// stop the timer
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	float Outime;
	cudaEventElapsedTime(&Outime, start, stop);
	printf("Output time for GPU: %f (ms) \n", Outime);

	float gputime_tot;
	gputime_tot = Intime + gputime + Outime;
	printf("Total time for GPU: %f (ms) \n", gputime_tot);
	fflush(stdout);

	FILE *outg;	// save GPU solution in phi_GPU.dat

	char phi_gpu_name[80];
	snprintf(phi_gpu_name, sizeof(phi_gpu_name), "phi_GPU_%d_%d_%f_%f_%f_%f.dat", Nx, tx, top, left, right, bottom);
	
	outg = fopen(phi_gpu_name,"w");

	fprintf(outg, "GPU field configuration:\n");
	for(int j = Ny - 1; j > -1; j--) {
		for(int i = 0; i < Nx; i++) {
			fprintf(outg, "%.2e ", g_new[i + j * Nx]);
		}
		fprintf(outg, "\n");
	}
	fclose(outg);

	printf("\n");

 
	// to compute the reference solution

	cpu_error = 10 * eps;		// any value bigger than eps 
	cpu_iter = 0;			// counter for iterations
	flag = true;

	float t, l, r, b;	// top, left, right, bottom
	int site, ym1, xm1, xp1, yp1;

	while ((cpu_error > eps) && (cpu_iter < MAX)){
		if(flag) {
			cpu_error = 0.0;
			for(int y = 0; y < Ny; y++){
				for(int x = 0; x < Nx; x++){ 
					if(x == 0 || x == Nx - 1 || y == 0 || y == Ny - 1) {	 
					}
					else {
						site = x + y * Nx;
						xm1 = site - 1;	// x - 1
						xp1 = site + 1;	// x + 1
						ym1 = site - Nx;	 // y - 1
						yp1 = site + Nx;	 // y + 1
						b = h_old[ym1]; 
						l = h_old[xm1]; 
						r = h_old[xp1]; 
						t = h_old[yp1]; 
						h_new[site] = 0.25 * (b + l + r + t);
						diff = h_new[site] - h_old[site]; 
						cpu_error = cpu_error + diff * diff;
					}
				} 
			} 
		}
		else{
			cpu_error = 0.0;
			for(int y=0; y < Ny; y++){
				for(int x = 0; x < Nx; x++){ 
					if(x == 0 || x == Nx - 1 || y == 0 || y == Ny - 1){
					}
					else{
						site = x + y * Nx;
						xm1 = site - 1;	// x - 1
						xp1 = site + 1;	// x + 1
						ym1 = site - Nx;	 // y - 1
						yp1 = site + Nx;	 // y + 1
						b = h_new[ym1]; 
						l = h_new[xm1]; 
						r = h_new[xp1]; 
						t = h_new[yp1]; 
						h_old[site] = 0.25 * (b + l + r + t);
						diff = h_new[site] - h_old[site]; 
						cpu_error = cpu_error + diff * diff;
					}
				}
			}
		}
		flag = !flag;
		cpu_iter++;
		cpu_error = sqrt(cpu_error);

//		printf("error = %.15e\n",error);
//		printf("iteration = %d\n",iter);

	}	 // exit if error < eps
	
	printf("error (CPU) = %.15e\n", cpu_error);
	printf("total iterations (CPU) = %d\n", cpu_iter);

	// stop the timer
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	float cputime;
	double cpu_flops;
	cudaEventElapsedTime(&cputime, start, stop);
	printf("Processing time for CPU: %f (ms) \n", cputime);
	cpu_flops = 7.0 * (Nx - 2) * (Ny - 2) * cpu_iter;
	printf("CPU Gflops: %lf\n", cpu_flops / (1000000.0 * cputime));
	printf("Speed up of GPU (Texture) = %f\n", cputime / (tex_gputime_tot));
	printf("Speed up of GPU = %f\n", cputime / (gputime_tot));
	fflush(stdout);

	// destroy the timer
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	FILE *outc;				 // save CPU solution in phi_CPU.dat
	char phi_cpu_name[80];
	snprintf(phi_cpu_name, sizeof(phi_cpu_name), "phi_CPU_%d_%d_%f_%f_%f_%f.dat", Nx, tx, top, left, right, bottom);
	outc = fopen(phi_cpu_name, "w");

	fprintf(outc, "CPU field configuration:\n");
	for(int j = Ny - 1; j > -1; j--) {
		for(int i = 0; i < Nx; i++) {
			fprintf(outc, "%.2e ", h_new[i + j * Nx]);
		}
		fprintf(outc,"\n");
	}
	fclose(outc);

	printf("%d,%d,%f,%f,%d,%f,%f,%f,%f,%f,%f,%d,%f,%f,%f,%f,%f,%d,%f,%f,%f,%f", 
		Nx, 
		tx, 
		tex_Intime,
		tex_error,
		tex_iter,
		tex_gputime,
		tex_flops / (1000000.0 * tex_gputime),
		tex_Outime,
		tex_gputime_tot,
		Intime,
		error,
		iter,
		gputime,
		flops / (1000000.0 * gputime),
		Outime,
		gputime_tot,
		cpu_error,
		cpu_iter,
		cputime,
		cpu_flops / (1000000.0 * cputime),
		cputime / (tex_gputime_tot),
		cputime / (gputime_tot));

	free(h_new);
	free(h_old);
	free(g_new);

	cudaDeviceReset();
}