Pthread fast matrix multiplication

Problem description

Matrix multiplication is the easiest parallelizable operations that can introduce a huge speedup gain when the matrixes have a large size. There are many libraries can help parallelizing matrix computations. POSIX Thread is a parallel execution model that exists independently from a language. It can programmer fine control over CPU threads in contrast to OpenMP and other high level libraries.

matrix.h

void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

This implementation splits the work of matrix multiplication by row. Let's say there are four threads (NUM_T = 4) and the size of the matrix is 4 (N = 4). Then for the first thread, its argument (struct arguments) start_row is 0 and end_row is 1.

#include "matrix.h"
#include <pthread.h>

// Number of thread
#define NUM_T 1

// Global variable for the size of the square matrix
int NG;

// Global variable for the convinence for multiple process
// to access matrix A, B, and C
unsigned long (*AG)[2048];
unsigned long (*BG)[2048];
unsigned long (*CG)[2048];

// Additional B temporary matrix is the tranpose of the B
// matrix created for the faster row-wise access speed
unsigned long BT[2048][2048];

// arguments type created for pthread arguments
typedef struct arguments{
    int start_row;
    int end_row;
}arguments;

// compute the multiplication of matrix rows specified
// by the arguments defined by starting row and ending
// row
void* compute_row(void* args){
    arguments* argument = (arguments*)args;
    int start_row = argument->start_row;
    int end_row = argument->end_row;
    for(int i = start_row; i < end_row + 1; i++){
        for (int j = 0; j < NG; j++) {
            unsigned long sum = 0;    // overflow, let it go.
            for (int k = 0; k < NG; k++)
                sum += AG[i][k] * BT[j][k];
            CG[i][j] = sum;
        }
    }
    pthread_exit(NULL);
}

// multiple two square matrix, A and B, with size N
// and return the value into C
void multiply(int N, unsigned long A[][2048], 
    unsigned long B[][2048], 
    unsigned long C[][2048]) {
    // assign matrix value and matrix size into local variables
    NG = N;
    AG = A;
    BG = B;
    CG = C;

    // initialize pthread attribute value
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    
    pthread_t thread_pool[NUM_T];
    arguments args_pool[NUM_T];

    // assign B matrix value into tranposed B temporary matrix
    for(int i = 0; i < N; i++){
        for(int j = 0; j < N; j++){
            BT[j][i] = BG[i][j];
        }
    }

    // if the number of matrix row is smaller than the total number of thread
    // then use the number of matrix row as the number of spliting the work
    int n_split = N < NUM_T ? N : NUM_T;
    // calculate the number of work row in each split
    int n_work = N < NUM_T ? 1 : N / NUM_T;

    for (int i = 0; i < n_split; i++) {
        arguments args;
        // calculate the starting and ending index of the row
        args.start_row = i * n_work;
        args.end_row = args.start_row + n_work;
        args_pool[i] = args;
        // create a thread, store in the thread pool, and assign its work
        // by argments pool
        pthread_create(&thread_pool[i], &attr, compute_row, (void*) &args_pool[i]);
    }

    for(int i = 0; i < n_split; i++){
        // wait for all threads to finish their jobs
        pthread_join(thread_pool[i], NULL);
    }
    pthread_attr_destroy(&attr);
}

main.c

Here, we consider two matrixes (A and B) with size N*N. For convenience, the matrixes are created from a sudo random number generator and the result of multiplication is hashed to a signature for verification. The first input is the size N, the second is the seed for matrix A, and the third is the seed for matrix B.

#include <stdio.h>
#include "matrix.h"
 
// #define DEBUG
#define UINT unsigned long
#define MAXN 2048
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
    UINT x = 2, n = N*N;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c + i + j)%n;
            A[i][j] = x;
        }
    }
}
void print_matrix(int N, UINT A[][MAXN]) {
    for (int i = 0; i < N; i++) {
        fprintf(stderr, "[");
        for (int j = 0; j < N; j++)
            fprintf(stderr, " %u", A[i][j]);
        fprintf(stderr, " ]\n");
    }
}
UINT hash(UINT x) {
    return (x * 2654435761LU);
}
UINT signature(int N, UINT A[][MAXN]) {
    UINT h = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++)
            h = hash(h + A[i][j]);
    }
    return h;
}
static UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
    int N, S1, S2;
    while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
        printf("%d %d %d\n", N, S1, S2);
        rand_gen(S1, N, A);
        rand_gen(S2, N, B);
        multiply(N, A, B, C);
#ifdef DEBUG
        print_matrix(N, A);
        print_matrix(N, B);
        print_matrix(N, C);
#endif
        printf("%u\n", signature(N, C));
    }
    return 0;
}

Reference

10080. Fast Matrix Multiplication (pthread)