// Includes
#include 
#include 

// Defines
#define RAND_MIN -1.0

// Variables
float* h_A;   // host vectors
float* h_B;
float* d_A;   // device vectors
float* d_B;

// Functions
void RandomInit(float*, int);

// Device code
__global__ void VecMax(const float* A, float* B, int N)
{
    extern __shared__ float cache[];

    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int cacheIndex = threadIdx.x;

    float temp = RAND_MIN;  // register for each thread
    while (i < N) {
        if(A[i] > temp)
            temp = A[i];
        i += blockDim.x * gridDim.x;  
    }
   
    cache[cacheIndex] = temp;   // set the cache value 

    __syncthreads();

    // perform parallel reduction, threadsPerBlock must be 2^m

    int ib = blockDim.x / 2;
    while (ib != 0) {
      if(cacheIndex < ib && cache[cacheIndex + ib] > cache[cacheIndex])
        cache[cacheIndex] = cache[cacheIndex + ib]; 

      __syncthreads();

      ib /= 2;
    }
    
    if(cacheIndex == 0)
      B[blockIdx.x] = cache[0];
}

// Host code

int main(void)
{

    int gid = 0;

    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    err = cudaSetDevice(gid);
    if (err != cudaSuccess) {
        printf("!!! Cannot select GPU with device ID = %d\n", gid);
        exit(1);
    }
    printf("Set GPU with device ID = %d\n", gid);

    cudaSetDevice(gid);

    printf("Maximum of an array: A\n");
    int N;

    printf("Enter the size of the vectors: ");
    scanf("%d", &N);        
    printf("%d\n", N);        

    // Set the sizes of threads and blocks

    int threadsPerBlock;
    printf("Enter the number (2^m) of threads per block: ");
    scanf("%d", &threadsPerBlock);
    printf("%d\n", threadsPerBlock);
    if(threadsPerBlock > 1024) {
        printf("The number of threads per block must be less than 1024 ! \n");
        exit(0);
    }

    int blocksPerGrid;
    printf("Enter the number of blocks per grid: ");
    scanf("%d", &blocksPerGrid);
    printf("%d\n", blocksPerGrid);

    if(blocksPerGrid > 2147483647) {
        printf("The number of blocks must be less than 2147483647 ! \n");
        exit(0);
    }

    // Allocate input vectors h_A in host memory

    int size = N * sizeof(float);
    int sb = blocksPerGrid * sizeof(float);

    h_A = (float*)malloc(size);
    h_B = (float*)malloc(sb);     // contains the result of the maximum from each block
    
    // Initialize input vectors

    RandomInit(h_A, N);

    // create the timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // start the timer
    cudaEventRecord(start, 0);

    // Allocate vectors in device memory
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, sb);

    // Copy vectors from host memory to device memory
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    
    // stop the timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float Intime;
    cudaEventElapsedTime(&Intime, start, stop);
    printf("Input time for GPU: %f (ms) \n", Intime);

    // start the timer
    cudaEventRecord(start, 0);

    int sm = threadsPerBlock * sizeof(float);
    VecMax<<>>(d_A, d_B, N);
    
    // stop the timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float gputime;
    cudaEventElapsedTime(&gputime, start, stop);
    printf("Processing time for GPU: %f (ms) \n", gputime);
    printf("GPU Gflops: %f\n", N / (1000000.0 * gputime));
    
    // Copy result from device memory to host memory
    // h_B contains the result of each block in host memory

    // start the timer
    cudaEventRecord(start, 0);

    cudaMemcpy(h_B, d_B, sb, cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_B);

    float h_G = 0.0;
    for(int i = 0; i < blocksPerGrid; i++)
        if(h_B[i] > h_G)
            h_G = h_B[i];
    

    // stop the timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float Outime;
    cudaEventElapsedTime(&Outime, start, stop);
    printf("Output time for GPU: %f (ms) \n", Outime);

    float gputime_tot;
    gputime_tot = Intime + gputime + Outime;
    printf("Total time for GPU: %f (ms) \n", gputime_tot);

    // start the timer
    cudaEventRecord(start, 0);

    // to compute the reference solution

    float h_C = RAND_MIN;
    for(int i = 0; i < N; i++)
        if(h_A[i] > h_C)
            h_C =  h_A[i];
    
    // stop the timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float cputime;
    cudaEventElapsedTime(&cputime, start, stop);
    printf("Processing time for CPU: %f (ms) \n", cputime);
    printf("CPU Gflops: %f\n", N / (1000000.0 * cputime));
    printf("Speed up of GPU = %f\n", cputime / (gputime_tot));

    // destroy the timer
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // check result

    printf("Check result:\n");
    float diff = abs((h_C - h_G) / h_C);
    printf("|(h_G - h_C)/h_C|=%20.15e\n", diff);
    printf("h_G = %20.15e\n", h_G);
    printf("h_C = %20.15e\n", h_C);

    printf("%d,%d,%d,%f,%f,%f,%f,%f,%f,%f,%f,%f",
        N,
        threadsPerBlock,
        blocksPerGrid,
        Intime,
        gputime,
        N / (1000000.0 * gputime),
        Outime,
        gputime_tot,
        cputime,
        N / (1000000.0 * cputime),
        cputime / (gputime_tot),
        diff
        );
    cudaDeviceReset();
}


// Allocates an array with random float entries in (-1, 1)
void RandomInit(float* data, int n)
{
    for (int i = 0; i < n; ++i)
        data[i] = rand() / (float)RAND_MAX * 2.0 + RAND_MIN;
}