CUDA Laplace equation on 2D lattice with texture memory
Introduction
Methods
__align__(8) texture<float> texOld; // declare the texture
__align__(8) texture<float> texNew;
__global__ void laplacianTex(float* phi_old, float* phi_new, float* C, bool flag)
{
extern __shared__ float cache[];
float t, l, c, r, b; // top, left, center, right, bottom
float diff;
int site, ym1, xm1, xp1, yp1;
int Nx = blockDim.x * gridDim.x;
int Ny = blockDim.y * gridDim.y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int cacheIndex = threadIdx.x + threadIdx.y * blockDim.x;
site = x + y * Nx;
if((x == 0) || (x == Nx - 1) || (y == 0) || (y == Ny - 1)) {
}
else {
xm1 = site - 1; // x-1
xp1 = site + 1; // x+1
ym1 = site - Nx; // y-1
yp1 = site + Nx; // y+1
if(flag){
b = tex1Dfetch(texOld, ym1); // read d_old via texOld
l = tex1Dfetch(texOld, xm1);
c = tex1Dfetch(texOld, site);
r = tex1Dfetch(texOld, xp1);
t = tex1Dfetch(texOld, yp1);
phi_new[site] = 0.25 * (b + l + r + t);
diff = phi_new[site] - c;
}
else {
b = tex1Dfetch(texNew, ym1); // read d_new via texNew
l = tex1Dfetch(texNew, xm1);
c = tex1Dfetch(texNew, site);
r = tex1Dfetch(texNew, xp1);
t = tex1Dfetch(texNew, yp1);
phi_old[site] = 0.25 * (b + l + r + t);
diff = phi_old[site] - c;
}
}
// each thread saves its error estimate to the shared memory
cache[cacheIndex] = diff * diff;
__syncthreads();
// parallel reduction in each block
int ib = blockDim.x * blockDim.y / 2;
while (ib != 0) {
if(cacheIndex < ib)
cache[cacheIndex] += cache[cacheIndex + ib];
__syncthreads();
ib /= 2;
}
// save the partial sum of each block to C
int blockIndex = blockIdx.x + gridDim.x * blockIdx.y;
if(cacheIndex == 0)
C[blockIndex] = cache[0];
}
To reproduce the results, firstly run test_params.py. Wait for all jobs to complete. Lastly, run gather_results.py.
test_params.py
import os
import time
lattice_sizes = [32, 64, 128, 256, 512, 640]
block_sizes = [4, 8, 16, 32]
bcs = [(0.0, 0.0, 0.0, 1.0), (1.0, -1.0, -2.0, 5.0)]
for lattice_size in lattice_sizes:
for block_size in block_sizes:
for bc in bcs:
in_name = '{}_{}_{}_{}_{}_{}_in'.format(lattice_size, block_size, bc[0], bc[1], bc[2], bc[3])
cmd_name = 'cmd_{}_{}_{}_{}_{}_{}'.format(lattice_size, block_size, bc[0], bc[1], bc[2], bc[3])
with open(in_name, 'wb') as f:
f.write('{} {}\n{} {}\n{} {} {} {}\n'.format(lattice_size, lattice_size, block_size, block_size,
bc[0], bc[1], bc[2], bc[3]))
with open(cmd_name, 'wb') as f:
cmd_content = ('Universe=vanilla\nExecutable=/opt/bin/runjob\nMachine_count=1\n\n'
'Output=condor.out\nError=condor.err\nLog=condor.log\n\n'
'Requirements=(MTYPE=="TEST")\n\nRequest_cpus=1\n\n'
'Initialdir=/path/to/working/directory\n\nArguments=./laplace {}_{}_{}_{}_{}_{}_in {}_{}_{}_{}_{}_{}_out\n\n'
'Queue\n\n').format(
lattice_size, block_size, bc[0], bc[1], bc[2], bc[3],
lattice_size, block_size, bc[0], bc[1], bc[2], bc[3])
f.write(cmd_content)
os.system('condor_submit {}'.format(cmd_name))
time.sleep(1)
gather_results.py
import os
import subprocess
bcs = [(0.0, 0.0, 0.0, 1.0), (1.0, -1.0, -2.0, 5.0)]
for bc in bcs:
result_name = 'results_{}_{}_{}_{}.csv'.format(bc[0], bc[1], bc[2], bc[3])
with open(result_name, 'wb') as f:
f.write('Size of the 2D lattice (x),'
'Number of threads per block (x),'
'Input time for GPU (Texture) (ms),'
'Error for GPU (Texture),'
'Iteration of GPU (Texture),'
'Processing time for GPU (Texture) (ms),'
'GPU (Texture) Gflops,'
'Output time for GPU (Texture) (ms),'
'Total time for GPU (Texture) (ms),'
'Input time for GPU (ms),'
'Error for GPU,'
'Iteration of GPU,'
'Processing time for GPU (ms),'
'GPU Gflops,'
'Output time for GPU (ms),'
'Total time for GPU (ms),'
'Error for CPU,'
'Iteration of CPU,'
'Processing time for CPU (ms),'
'CPU Gflops,'
'Speed up of GPU (Texture),'
'Speed up of GPU\n')
for file_name in os.listdir('./'):
if '{}_{}_{}_{}_out'.format(bc[0], bc[1], bc[2], bcs[3]) in file_name:
line = subprocess.check_output(['tail', '-1', file_name])
if not ',' in line:
continue
f.write(line + '\n')
f.close()
Results
For the size of 2D lattice less than 128×128, CPU has a better performance than GPU with texture and GPU. Different from the previous homework, this time the memory copying time of input and output only have a little effect on the total, less than 2 milliseconds, comparing to the processing time 60-170000 milliseconds.
While the sizes of 2D lattice are 128×128 and 256×256, GPU with device memory has a better performance the CPU and GPU with texture memory. When the size of 2D lattice is 128×128, the best configuration of number of threads per block is 16×16 and number of blocks per grid will be 8×8. While the size of 2D lattice is 256×256, the best setting of number of threads per block is 8×8 and number of blocks per grid will be 32×32.
However, if the size of the 2d lattice grows over 512×512, texture memory has shown a substantial advantage over GPU and CPU without losing any error. When the size of 2D lattice is 512×512, the besting configuration of number of threads per block is 16×16 and number of blocks per grid will be 32×32. While the size of 2D lattice is 640×640, the best setting of number of threads per block is 8×8 and number of blocks per grid will be 80×80. The best speedup of CPU, GPU, and GPU with texture are highlighted in yellow.
Here is the result for GPU with texture memory. The size of the 2D lattice (y) and the number of threads per block (y) is the same as the value of x, so they are omitted:
Size of the 2D lattice (x) |
Number of threads per block (x) |
Input time (ms) |
Error |
Iteration |
Processing time (ms) |
Gflops |
Output time (ms) |
Total time (ms) |
32 |
4 |
0.53472 |
0 |
2606 |
60.45296 |
0.27158 |
0.375648 |
61.36333 |
32 |
8 |
0.534208 |
0 |
2606 |
59.2072 |
0.277294 |
0.370848 |
60.11226 |
32 |
16 |
0.540384 |
0 |
2606 |
58.31558 |
0.281534 |
0.369632 |
59.2256 |
32 |
32 |
0.53168 |
0 |
2606 |
67.27274 |
0.244048 |
0.371552 |
68.17596 |
64 |
4 |
0.472832 |
0 |
9745 |
227.8239 |
1.15097 |
0.3424 |
228.6391 |
64 |
8 |
0.545952 |
0 |
9745 |
221.5026 |
1.183817 |
0.381376 |
222.4299 |
64 |
16 |
0.54032 |
0 |
9745 |
220.2206 |
1.190708 |
0.378624 |
221.1395 |
64 |
32 |
0.53952 |
0 |
9745 |
281.0736 |
0.932917 |
0.378848 |
281.992 |
128 |
4 |
0.575232 |
0 |
35073 |
1161.547 |
3.355639 |
0.41488 |
1162.537 |
128 |
8 |
0.575168 |
0 |
35073 |
1033.322 |
3.772042 |
0.421888 |
1034.319 |
128 |
16 |
0.581408 |
0 |
35073 |
1025.425 |
3.801092 |
0.417024 |
1026.423 |
128 |
32 |
0.517376 |
0 |
35073 |
940.3292 |
4.145073 |
0.375488 |
941.222 |
256 |
4 |
0.727232 |
0 |
124611 |
6201.993 |
9.073829 |
0.52672 |
6203.247 |
256 |
8 |
0.735008 |
0 |
124611 |
4199.054 |
13.40203 |
0.559168 |
4200.348 |
256 |
16 |
0.66064 |
0 |
124611 |
3460.476 |
16.26245 |
0.497312 |
3461.634 |
256 |
32 |
0.746048 |
0 |
124611 |
4435.073 |
12.68882 |
0.548864 |
4436.368 |
512 |
4 |
1.389184 |
0 |
423553 |
75713.64 |
10.18526 |
1.18976 |
75716.22 |
512 |
8 |
1.503232 |
0 |
423553 |
45034.16 |
17.12395 |
1.272224 |
45036.94 |
512 |
16 |
1.501248 |
0 |
423553 |
27503.4 |
28.03882 |
1.248928 |
27506.15 |
512 |
32 |
1.388832 |
0 |
423553 |
33944.7 |
22.71821 |
1.164576 |
33947.26 |
640 |
4 |
1.663968 |
0 |
619850 |
169852.2 |
10.39812 |
1.705984 |
169855.6 |
640 |
8 |
1.682016 |
0 |
619850 |
85307.44 |
20.70328 |
1.754656 |
85310.88 |
640 |
16 |
1.573088 |
0 |
619850 |
71908.79 |
24.56089 |
1.502496 |
71911.86 |
640 |
32 |
1.701472 |
0 |
619850 |
98867.45 |
17.86375 |
1.638976 |
98870.8 |
Size of the 2D lattice (x) |
Number of threads per block (x) |
Input time (ms) |
Error |
Iteration |
Processing time (ms) |
Gflops |
Output time (ms) |
Total time (ms) |
32 |
4 |
0.430368 |
0 |
2606 |
44.80208 |
0.366452 |
0.344288 |
45.57673 |
32 |
8 |
0.43776 |
0 |
2606 |
43.6656 |
0.375989 |
0.344096 |
44.44746 |
32 |
16 |
0.426784 |
0 |
2606 |
43.69744 |
0.375715 |
0.343456 |
44.46768 |
32 |
32 |
0.440608 |
0 |
2606 |
64.35703 |
0.255105 |
0.344 |
65.14163 |
64 |
4 |
0.378528 |
0 |
9745 |
225.6186 |
1.16222 |
0.317824 |
226.315 |
64 |
8 |
0.43152 |
0 |
9745 |
167.6807 |
1.563796 |
0.35088 |
168.4631 |
64 |
16 |
0.439424 |
0 |
9745 |
172.7931 |
1.517529 |
0.349472 |
173.582 |
64 |
32 |
0.437696 |
0 |
9745 |
240.7429 |
1.089205 |
0.353376 |
241.534 |
128 |
4 |
0.459712 |
0 |
35073 |
967.6324 |
4.028113 |
0.361952 |
968.454 |
128 |
8 |
0.456672 |
0 |
35073 |
901.0295 |
4.325866 |
0.370496 |
901.8567 |
128 |
16 |
0.46256 |
0 |
35073 |
868.0142 |
4.490402 |
0.37264 |
868.8494 |
128 |
32 |
0.41152 |
0 |
35073 |
785.0016 |
4.965254 |
0.3376 |
785.7507 |
256 |
4 |
0.60864 |
0 |
124611 |
5542.933 |
10.15272 |
0.444672 |
5543.986 |
256 |
8 |
0.613536 |
0 |
124611 |
3489.02 |
16.12941 |
0.436608 |
3490.07 |
256 |
16 |
0.54944 |
0 |
124611 |
3227.097 |
17.43853 |
0.406112 |
3228.053 |
256 |
32 |
0.63232 |
0 |
124611 |
3891.992 |
14.45939 |
0.454656 |
3893.079 |
512 |
4 |
1.337792 |
0 |
423553 |
96573.79 |
7.98522 |
0.872256 |
96576 |
512 |
8 |
1.45424 |
0 |
423553 |
64176.78 |
12.01623 |
0.931552 |
64179.16 |
512 |
16 |
1.672544 |
0 |
423553 |
59243.65 |
13.0168 |
0.93696 |
59246.26 |
512 |
32 |
1.33168 |
0 |
423553 |
66473.89 |
11.60099 |
0.84448 |
66476.06 |
640 |
4 |
1.717184 |
0 |
619850 |
235964.7 |
7.48478 |
1.170912 |
235967.5 |
640 |
8 |
1.751296 |
0 |
619850 |
112914.5 |
15.64143 |
1.165984 |
112917.4 |
640 |
16 |
1.562688 |
0 |
619850 |
109190.2 |
16.17493 |
1.04288 |
109192.8 |
640 |
32 |
1.643424 |
0 |
619850 |
148595.7 |
11.88556 |
1.157728 |
148598.5 |
Size of the 2D lattice (x) |
Number of threads per block (x) |
Error |
Iteration |
Processing time (ms) |
Gflops |
Speed up of GPU (Texture) |
Speed up of GPU |
32 |
4 |
0 |
2606 |
13.28211 |
1.236084 |
0.21645 |
0.291423 |
32 |
8 |
0 |
2606 |
12.53434 |
1.309826 |
0.208515 |
0.282003 |
32 |
16 |
0 |
2606 |
12.74694 |
1.287979 |
0.215227 |
0.286656 |
32 |
32 |
0 |
2606 |
12.75574 |
1.287091 |
0.1871 |
0.195816 |
64 |
4 |
0 |
9745 |
123.4744 |
2.123667 |
0.54004 |
0.545586 |
64 |
8 |
0 |
9745 |
150.6378 |
1.740721 |
0.677237 |
0.894189 |
64 |
16 |
0 |
9745 |
151.4719 |
1.731136 |
0.684961 |
0.872624 |
64 |
32 |
0 |
9745 |
151.1162 |
1.735211 |
0.535888 |
0.625652 |
128 |
4 |
0 |
35073 |
2185.981 |
1.783059 |
1.880353 |
2.257186 |
128 |
8 |
0 |
35073 |
2187.823 |
1.781558 |
2.115231 |
2.42591 |
128 |
16 |
0 |
35073 |
2293.807 |
1.699242 |
2.234758 |
2.640051 |
128 |
32 |
0 |
35073 |
1798.081 |
2.167718 |
1.910369 |
2.288361 |
256 |
4 |
0 |
124611 |
32237.32 |
1.745673 |
5.196846 |
5.814826 |
256 |
8 |
0 |
124611 |
32830.42 |
1.714136 |
7.81612 |
9.40681 |
256 |
16 |
0 |
124611 |
26357.79 |
2.135074 |
7.614263 |
8.165229 |
256 |
32 |
0 |
124611 |
33139.02 |
1.698174 |
7.469854 |
8.512291 |
512 |
4 |
0 |
423553 |
362858.6 |
2.125244 |
4.792349 |
3.757233 |
512 |
8 |
0 |
423553 |
425090.2 |
1.814116 |
9.4387 |
6.623492 |
512 |
16 |
0 |
423553 |
502324.2 |
1.53519 |
18.26225 |
8.47858 |
512 |
32 |
0 |
423553 |
360227.9 |
2.140764 |
10.6114 |
5.418911 |
640 |
4 |
0 |
619850 |
997622.3 |
1.770353 |
5.873357 |
4.227795 |
640 |
8 |
0 |
619850 |
1178118 |
1.499123 |
13.8097 |
10.43345 |
640 |
16 |
0 |
619850 |
834376.6 |
2.116722 |
11.60277 |
7.641317 |
640 |
32 |
0 |
619850 |
998127 |
1.769458 |
10.09527 |
6.716939 |
Conclusions
- For Laplace equation on 2D lattice, the speedup of GPU appears when the size of lattice square is larger than 128×128.
- The benefit of texture only appears when the size of lattice square grows larger than 512×512.
- The best configuration for size of 2D lattice 512×512 is 16×16 number of threads per blocks in GPU with device memory when the boundary condition is (ϕ_top,ϕ_left,ϕ_right,ϕ_bottom)=(+1,-1,-2,+5).