// Vector addition: C = 1/A + 1/B.
// compile with the following command:
//
// nvcc -O3 -o vecAdd vecAdd.cu
//
// (for GTX970)
// nvcc -arch=compute_52 -code=sm_52,sm_52 -O3 -m64 -o vecAdd vecAdd.cu
//
// (for GTX1060)
// nvcc -arch=compute_61 -code=sm_61,sm_61 -O3 -m64 -o vecAdd vecAdd.cu


// Includes
#include 
#include 

// Defines

// Variables
const int N = 6400;
float h_A[N][N];   // host vectors
float h_B[N][N];
float h_C[N][N];
float h_D[N][N];
float* d_A;   // device vectors
float* d_B;
float* d_C;

// Functions
void RandomInit(float data[N][N]);

/**
 * Get parameters from STDIN.
 */
static void read_from_stdin(int *blockDimx, int *blockDimy, int *work)
{
    char *s, buf[1024];

    fgets(buf, 1023, stdin);
    if ((s = strchr(buf, '#')) != NULL) *s = '\0';
    *blockDimx = atoi(buf);

    fgets(buf, 1023, stdin);
    if ((s = strchr(buf, '#')) != NULL) *s = '\0';
    *blockDimy = atoi(buf);

    fgets(buf, 1023, stdin);
    if ((s = strchr(buf, '#')) != NULL) *s = '\0';
    *work = atoi(buf);
}

// Device code
__global__ void VecAdd(const float A[N][N], const float B[N][N], float C[N][N], int N, int work)
{

    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    if (i < N / work && j < N / work){
        for(int k = 0; k < work; ++k)
            for(int l = 0; l < work; ++l)
                C[i * work + k][j * work + l] = 1.0 / A[i * work + k][j * work + l] + 1.0 / B[i * work + k][j * work + l];
    }
    
    __syncthreads();

}

// Host code

int main(int argc, char* argv[])
{
    int gid = 0;
    int blockDimx = 16;
    int blockDimy = 16;
    int work = 1;
    read_from_stdin(&blockDimx, &blockDimy, &work);

    printf("The number of work for each thread is %d\n", work);

    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // scanf("%d", &gid);
    err = cudaSetDevice(gid);
    if(err != cudaSuccess){
        printf("!!! Cannot select GPU with device ID = %d\n", gid);
        exit(1);
    }
    // printf("Set GPU with device ID = %d\n", gid);

    cudaSetDevice(gid);

    // printf("Vector Addition: C = 1/A + 1/B\n");
    int mem = 1024 * 1024 * 1024;     // Giga

next:       
    if(3 * N * N > mem){     // each real number takes 4 bytes
      printf("The size of these 3 vectors cannot be fitted into 4 Gbyte\n");
      goto next;
    }
    long size = N * N * sizeof(float);
    
    // Initialize input vectors

    RandomInit(h_A);
    RandomInit(h_B);

    // Set the sizes of threads and blocks

    dim3 threadsPerBlock(blockDimx, blockDimy);
    printf("The number of thread is (%d, %d)\n", blockDimx, blockDimy);
loop:
    if(threadsPerBlock.x * threadsPerBlock.x > 1024){
      printf("The number of threads per block must be less than 1024 ! \n");
      goto loop;
    }

    dim3 blocksPerGrid(N / blockDimx / work, N / blockDimy / work);
    printf("The number of blocks is (%d, %d)\n", blocksPerGrid.x, blocksPerGrid.y);
    if(blocksPerGrid.x * blocksPerGrid.y > 2147483647){
      printf("The number of blocks must be less than 2147483647 ! \n");
      goto loop;
    }

    // create the timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // start the timer
    cudaEventRecord(start, 0);

    // Allocate vectors in device memory

    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    // Copy vectors from host memory to device memory

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    
    // stop the timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float Intime;
    cudaEventElapsedTime(&Intime, start, stop);
    printf("Input time for GPU: %f (ms) \n", Intime);

    // start the timer
    cudaEventRecord(start, 0);

    VecAdd<<>>((float(*)[N])d_A, (float(*)[N])d_B, (float(*)[N])d_C, N, work);
    
    // stop the timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float gputime;
    cudaEventElapsedTime(&gputime, start, stop);
    printf("Processing time for GPU: %f (ms) \n", gputime);
    printf("GPU Gflops: %f\n", 3 * N / (1000000.0 * gputime));
    
    // Copy result from device memory to host memory
    // h_C contains the result in host memory

    // start the timer
    cudaEventRecord(start, 0);

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // stop the timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float Outime;
    cudaEventElapsedTime(&Outime, start, stop);
    printf("Output time for GPU: %f (ms) \n", Outime);

    float gputime_tot;
    gputime_tot = Intime + gputime + Outime;
    printf("Total time for GPU: %f (ms) \n", gputime_tot);

    // start the timer
    cudaEventRecord(start, 0);

    // to compute the reference solution
    for(int i = 0; i < N; ++i)
        for(int j = 0; j < N; ++j)
            h_D[i][j] = 1.0 / h_A[i][j] + 1.0 / h_B[i][j];
    
    // stop the timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float cputime;
    cudaEventElapsedTime(&cputime, start, stop);
    printf("Processing time for CPU: %f (ms) \n", cputime);
    printf("CPU Gflops: %f\n", 3 * N / (1000000.0 * cputime));
    printf("Speed up of GPU = %f\n", cputime / (gputime_tot));

    // destroy the timer
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // check result

    printf("Check result:\n");
    double sum = 0; 
    double diff;
    for(int i = 0; i < N; ++i){
        for(int j = 0; j < N; ++j){
            diff = abs(h_D[i][j] - h_C[i][j]);
            sum += diff * diff; 
        }
    }
    sum = sqrt(sum);
    printf("norm(h_C - h_D) = %20.15e\n\n", sum);

    printf("%d,%d,%d,%d,%d,%f,%f,%f,%f,%f,%f,%f,%f,%f", 
        work, 
        blockDimy, 
        blockDimy,
        blocksPerGrid.x,
        blocksPerGrid.y,
        Intime,
        gputime,
        3 * N / (1000000.0 * gputime),
        Outime,
        gputime_tot,
        cputime,
        3 * N / (1000000.0 * cputime),
        cputime / (gputime_tot),
        sum
        );
    cudaDeviceReset();
}


// Allocates an array with random float entries.
void RandomInit(float data[N][N])
{
    for(int i = 0; i < N; ++i)
        for(int j = 0; j < N; ++j)
            data[i][j] = rand() / (float)RAND_MAX;
}