CUDA 2D matrix element-wise reciprocal addition
Introduction
Methods
// Variables
const int N = 6400;
float h_A[N][N];   // host vectors
float h_B[N][N];
float h_C[N][N];
float h_D[N][N];
float* d_A;   // device vectors
float* d_B;
float* d_C;// Device code
__global__ void VecAdd(const float A[N][N], const float B[N][N], float C[N][N], int N, int work)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    if (i < N / work && j < N / work){
        for(int k = 0; k < work; ++k)
            for(int l = 0; l < work; ++l)
                C[i * work + k][j * work + l] = 1.0 / A[i * work + k][j * work + l] + 1.0 / B[i * work + k][j * work + l];
    }
    
    __syncthreads();
}// Allocates an array with random float entries.
void RandomInit(float data[N][N])
{
    for(int i = 0; i < N; ++i)
        for(int j = 0; j < N; ++j)
            data[i][j] = rand() / (float)RAND_MAX;
}Tuning block and grid size
To reproduce the results, firstly run test_params.py. Wait for all jobs to complete. Lastly, run gather_results.py.
test_params.py
import os
import time
testing_params = [(4, 4, 1), (8, 8, 1), (10, 10, 1), (16, 16, 1), (20, 20, 1), (32, 32, 1),
	(16, 16, 2), (16, 16, 4), (16, 16, 16), (16, 16, 80)]
for params in testing_params:
	in_name = '{}_{}_{}_in'.format(params[0], params[1], params[2])
	cmd_name = 'cmd_{}_{}_{}'.format(params[0], params[1], params[2])
	with open(in_name, 'wb') as f:
		f.write('{}\n{}\n{}#\n'.format(params[0], params[1], params[2]))
	with open(cmd_name, 'wb') as f:
		cmd_content = ('Universe=vanilla\nExecutable=/opt/bin/runjob\nMachine_count=1\n\n'
			'Output=condor.out\nError=condor.err\nLog=condor.log\n\n'
			'Requirements=(MTYPE=="TEST")\n\nRequest_cpus=1\n\n'
			'Initialdir=/path/to/working/directory\n\nArguments=./vecAdd_v2 {}_{}_{}_in {}_{}_{}_out\n\n'
			'Queue\n\n').format(
			params[0], params[1], params[2],
			params[0], params[1], params[2])
		f.write(cmd_content)
	os.system('condor_submit {}'.format(cmd_name))
	time.sleep(1)gather_results.py
import os
import subprocess
result_name = 'results.csv'
with open(result_name, 'wb') as f:
	f.write('work for each thread,'
		'number of thread x,'
		'number of thread y,'
		'number of blocks x,'
		'number of blocks y,'
		'input time for GPU,'
		'processing time for GPU (ms),'
		'GPU Gflops,'
		'output time for GPU (ms),'
		'total time for GPU (ms),'
		'processing time for CPU (ms),'
		'CPU Gflops,'
		'speed up of GPU,'
		'norm(h_C - h_D)')
	for file_name in os.listdir('./'):
		if 'out' in file_name:
			line = subprocess.check_output(['tail', '-1', file_name])
			f.write(line + '\n')
	f.close()Results
| work for each thread | dimension of thread x and y | dimension of blocks x and y | input time for GPU | processing time for GPU (ms) | output time for GPU (ms) | total time for GPU (ms) | processing time for CPU (ms) | speed up of GPU | 
| 1 | 4 | 1600 | 61.09626 | 26.14269 | 90.63619 | 177.8751 | 362.3188 | 2.036928 | 
| 1 | 8 | 800 | 61.12278 | 14.18016 | 91.81216 | 167.1151 | 362.2879 | 2.167894 | 
| 1 | 10 | 640 | 61.04918 | 16.49891 | 92.22903 | 169.7771 | 362.9761 | 2.137956 | 
| 1 | 16 | 400 | 61.06547 | 12.70762 | 89.49069 | 163.2638 | 362.535 | 2.220548 | 
| 1 | 20 | 320 | 61.03123 | 15.13014 | 89.81895 | 165.9803 | 362.6187 | 2.184709 | 
| 1 | 32 | 200 | 61.07587 | 14.31136 | 89.54723 | 164.9345 | 362.4345 | 2.197445 | 
| 2 | 16 | 200 | 61.09696 | 11.56202 | 91.93604 | 164.595 | 362.6427 | 2.203243 | 
| 4 | 16 | 100 | 61.10176 | 17.69344 | 89.79328 | 168.5885 | 362.7742 | 2.151833 | 
| 16 | 16 | 25 | 61.11475 | 63.6985 | 89.61117 | 214.4244 | 362.4164 | 1.690182 | 
| 80 | 16 | 5 | 61.04218 | 78.78506 | 92.21085 | 232.0381 | 362.2871 | 1.561326 | 
- GPU spends most of the time on memory copying. Input time and output time has dominated the total time of GPU. To be specific, the best result has spent (61.06547+89.49069)/163.2638 = 92% of time on memory copying.
- Alternating the number of dimension of blocks don’t have much influence on the speed up of GPU in this simple computation.
- Increate the work for each thread can actually reduce speed up of GPU.
Conclusions
- The total time of GPU computing is dominant by the time of memory copying between host and device. So it is critical to minimize the communication overhead.
- Since this work only contains vector addition, a relative easy task, the fine-tuning of block sizes has no effect on the speed up. I’m looking forward to solve harder problems to observe the relations between these settings and speed up.