Pthread fast matrix multiplication
Problem description
Matrix multiplication is the easiest parallelizable operations that can introduce a huge speedup gain when the matrixes have a large size. There are many libraries can help parallelizing matrix computations. POSIX Thread is a parallel execution model that exists independently from a language. It can programmer fine control over CPU threads in contrast to OpenMP and other high level libraries.
matrix.h
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);
matrix.c
This implementation splits the work of matrix multiplication by row. Let's say there are four threads (NUM_T = 4) and the size of the matrix is 4 (N = 4). Then for the first thread, its argument (struct arguments) start_row is 0 and end_row is 1.
#include "matrix.h"
#include <pthread.h>
// Number of thread
#define NUM_T 1
// Global variable for the size of the square matrix
int NG;
// Global variable for the convinence for multiple process
// to access matrix A, B, and C
unsigned long (*AG)[2048];
unsigned long (*BG)[2048];
unsigned long (*CG)[2048];
// Additional B temporary matrix is the tranpose of the B
// matrix created for the faster row-wise access speed
unsigned long BT[2048][2048];
// arguments type created for pthread arguments
typedef struct arguments{
int start_row;
int end_row;
}arguments;
// compute the multiplication of matrix rows specified
// by the arguments defined by starting row and ending
// row
void* compute_row(void* args){
arguments* argument = (arguments*)args;
int start_row = argument->start_row;
int end_row = argument->end_row;
for(int i = start_row; i < end_row + 1; i++){
for (int j = 0; j < NG; j++) {
unsigned long sum = 0; // overflow, let it go.
for (int k = 0; k < NG; k++)
sum += AG[i][k] * BT[j][k];
CG[i][j] = sum;
}
}
pthread_exit(NULL);
}
// multiple two square matrix, A and B, with size N
// and return the value into C
void multiply(int N, unsigned long A[][2048],
unsigned long B[][2048],
unsigned long C[][2048]) {
// assign matrix value and matrix size into local variables
NG = N;
AG = A;
BG = B;
CG = C;
// initialize pthread attribute value
pthread_attr_t attr;
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
pthread_t thread_pool[NUM_T];
arguments args_pool[NUM_T];
// assign B matrix value into tranposed B temporary matrix
for(int i = 0; i < N; i++){
for(int j = 0; j < N; j++){
BT[j][i] = BG[i][j];
}
}
// if the number of matrix row is smaller than the total number of thread
// then use the number of matrix row as the number of spliting the work
int n_split = N < NUM_T ? N : NUM_T;
// calculate the number of work row in each split
int n_work = N < NUM_T ? 1 : N / NUM_T;
for (int i = 0; i < n_split; i++) {
arguments args;
// calculate the starting and ending index of the row
args.start_row = i * n_work;
args.end_row = args.start_row + n_work;
args_pool[i] = args;
// create a thread, store in the thread pool, and assign its work
// by argments pool
pthread_create(&thread_pool[i], &attr, compute_row, (void*) &args_pool[i]);
}
for(int i = 0; i < n_split; i++){
// wait for all threads to finish their jobs
pthread_join(thread_pool[i], NULL);
}
pthread_attr_destroy(&attr);
}
main.c
Here, we consider two matrixes (A and B) with size N*N. For convenience, the matrixes are created from a sudo random number generator and the result of multiplication is hashed to a signature for verification. The first input is the size N, the second is the seed for matrix A, and the third is the seed for matrix B.
#include <stdio.h>
#include "matrix.h"
// #define DEBUG
#define UINT unsigned long
#define MAXN 2048
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
UINT x = 2, n = N*N;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
A[i][j] = x;
}
}
}
void print_matrix(int N, UINT A[][MAXN]) {
for (int i = 0; i < N; i++) {
fprintf(stderr, "[");
for (int j = 0; j < N; j++)
fprintf(stderr, " %u", A[i][j]);
fprintf(stderr, " ]\n");
}
}
UINT hash(UINT x) {
return (x * 2654435761LU);
}
UINT signature(int N, UINT A[][MAXN]) {
UINT h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = hash(h + A[i][j]);
}
return h;
}
static UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
int N, S1, S2;
while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
printf("%d %d %d\n", N, S1, S2);
rand_gen(S1, N, A);
rand_gen(S2, N, B);
multiply(N, A, B, C);
#ifdef DEBUG
print_matrix(N, A);
print_matrix(N, B);
print_matrix(N, C);
#endif
printf("%u\n", signature(N, C));
}
return 0;
}