#include <time.h>
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>
#include <device_functions.h>

// Compatibility shim: the rest of the file is written against the legacy
// "cutil" names (cutCreateTimer, cutilSafeCall, ...). When compiled with
// CUDA 7 or newer those names are mapped onto the helper_* headers below;
// otherwise the legacy cutil header is included directly.
#if ( __CUDACC_VER_MAJOR__ >=7 )
    #include <helper_cuda.h>
    #include <helper_functions.h>
//    #define cutilCheckError(call) checkCudaErrors(call)
    // Timer handle type used by main(); here it is a pointer to the helper timer.
    #define cudaStopWatchInterface StopWatchInterface *
    // NOTE: in this branch both "check" macros expand to the bare call, so the
    // wrapped call's return value is NOT inspected.
    #define cutilCheckError(call) call
    #define cutilSafeCall(call) call
    // Timer wrappers: creation receives whatever the caller passes (main passes
    // the address of the handle); the remaining wrappers take the handle itself
    // and pass its address on.
    #define cutCreateTimer(x) sdkCreateTimer(x)
    #define cutResetTimer(x) sdkResetTimer(&x)
    #define cutStartTimer(x) sdkStartTimer(&x)
    #define cutStopTimer(x) sdkStopTimer(&x)
    #define cutGetTimerValue(x) sdkGetTimerValue(&x)
    #define cutilDeviceSynchronize cudaDeviceSynchronize
    // Checked-call helper: evaluates `call`, and on any result other than
    // cudaSuccess prints file, line and the CUDA error string to stderr and
    // exits with EXIT_FAILURE. Defined only in this branch; not referenced
    // anywhere else in the visible source.
    #define cutiliTESTSafeCall(call) \
    do { \
        cudaError_t err = call; \
        if (cudaSuccess != err) { \
           fprintf (stderr, "Cuda error in file '%s' in line %i : %s.",  \
                 __FILE__, __LINE__, cudaGetErrorString(err) ); \
           exit(EXIT_FAILURE); \
       } \
    } while (0)
#else
    // Legacy toolchain: the cut*/cutil* names used in this file are presumably
    // provided by cutil_inline.h -- TODO confirm against the SDK in use.
    #include <cutil_inline.h>
    #include <sm_11_atomic_functions.h>
    // Timer handle is an unsigned integer here. `uint` is typedef'd further
    // down the file; that is fine because the macro is only expanded at its
    // point of use.
    #define cudaStopWatchInterface uint
#endif
// Short aliases used throughout the kernels and host code.
typedef unsigned char uchar;
typedef unsigned int  uint;

// Number of histogram bins (one per possible uchar input value).
#define HISTOGRAM_BIN_COUNT 256
// Number of input bytes generated and processed by main().
#define N 1024
// Threads per block that main() uses to launch histogram4a; histogram4a also
// uses it as the stride of its clear and merge loops.
#define HISTOGRAM_BLOCK_SIZE 64
// Number of per-warp sub-histograms histogram4a keeps in shared memory.
// NOTE(review): with HISTOGRAM_BLOCK_SIZE == 64 the warp index
// (threadIdx.x >> LOG2_WARP_SIZE) only takes the values 0 and 1, so only two
// of the six sub-histograms receive updates -- confirm this is intended.
#define WARP_COUNT 6
// Shift that converts a thread index into a warp index (2^5 == 32).
#define LOG2_WARP_SIZE 5U
// Number of blocks launched for histogram4a, which is also the number of
// partial histograms allocated in main() and merged by histogram4b.
#define PHCOUNT 240
// Timed iterations per kernel in main(); one extra untimed warm-up iteration
// (iter == -1) is run before the timer starts.
#define NumRuns 10000

/*
 * Serial reference histogram.
 *
 * Every thread whose global index is greater than zero exits immediately;
 * the remaining thread walks the whole input and increments one bin per
 * input byte with a plain (non-atomic) read-modify-write.
 *
 *   histogram : bin counters, indexed by input byte value; accumulated into,
 *               not cleared here
 *   color     : input bytes
 *   size      : number of input bytes
 */
__global__ void histogram0(uint* histogram, uchar* color, int size)
{
    const int globalId = blockIdx.x * blockDim.x + threadIdx.x;

    if (globalId <= 0) {
        // Same unsigned comparison the loop bound has always used.
        const uint count = size;
        uint pos = 0;
        while (pos < count) {
            histogram[color[pos]] += 1;
            ++pos;
        }
    }
}

/*
 * One-thread-per-element histogram using atomics directly on the output.
 *
 * Each thread with a global index below `size` reads one input byte and
 * atomically increments the matching bin of `histogram`. Threads past the
 * end of the input do nothing, so the launch must supply at least `size`
 * threads in total for every element to be counted.
 *
 *   histogram : bin counters, indexed by input byte value; accumulated into,
 *               not cleared here
 *   color     : input bytes
 *   size      : number of input bytes
 */
__global__ void histogram1(uint* histogram, uchar* color, int size)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < size) {
        atomicAdd(histogram + color[idx], 1);
    }
}

#define MERGE_THREADBLOCK_SIZE 256
/*
 * Histogram with one private sub-histogram per block in shared memory.
 *
 * Phases (separated by __syncthreads()):
 *   1. the block zeroes its shared sub-histogram,
 *   2. threads walk the input in a grid-stride loop and atomically bump the
 *      shared bins,
 *   3. the block atomically adds its shared bins into `histogram`.
 *
 * Phases 1 and 3 are strided by blockDim.x, so the kernel is correct for any
 * 1-D block size. (Previously phase 3 used threadIdx.x as the bin index,
 * which silently dropped bins for blocks smaller than HISTOGRAM_BIN_COUNT and
 * indexed out of bounds for larger ones.)
 *
 *   histogram : HISTOGRAM_BIN_COUNT counters; accumulated into, so the caller
 *               must clear it before the launch
 *   color     : input bytes
 *   size      : number of input bytes
 *
 * Shared memory: HISTOGRAM_BIN_COUNT * sizeof(uint) bytes, statically declared.
 */
__global__ void histogram2(uint* histogram, uchar* color, int size)
{
    __shared__ uint data[HISTOGRAM_BIN_COUNT];

    // Phase 1: clear the block-private bins.
    for (int bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += blockDim.x)
        data[bin] = 0;
    __syncthreads();

    // Phase 2: accumulate into shared memory.
    const int gridStride = blockDim.x * gridDim.x;
    for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += gridStride)
        atomicAdd( &data[color[i]], 1);
    __syncthreads();

    // Phase 3: fold the block-private bins into the global result.
    for (int bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += blockDim.x)
        atomicAdd( &(histogram[bin]), data[bin] );
}

/*
 * Block-private shared-memory histogram that works for any 1-D block size.
 *
 * Each block clears a shared sub-histogram, fills it from the input with a
 * grid-stride loop of shared-memory atomics, and finally adds every shared
 * bin into `histogram` with global atomics. The clear and merge loops step
 * by blockDim.x, so all HISTOGRAM_BIN_COUNT bins are covered regardless of
 * how many threads the block has.
 *
 *   histogram : HISTOGRAM_BIN_COUNT counters; accumulated into, not cleared
 *   color     : input bytes
 *   size      : number of input bytes
 */
__global__ void histogram3(uint* histogram, uchar* color, int size)
{
    __shared__ uint data[HISTOGRAM_BIN_COUNT];

    const int blockStride = blockDim.x;
    const int gridStride  = blockDim.x * gridDim.x;

    // Initialization: zero the shared bins.
    int slot = threadIdx.x;
    while (slot < HISTOGRAM_BIN_COUNT) {
        data[slot] = 0;
        slot += blockStride;
    }
    __syncthreads();

    // Calculate the block-private histogram.
    uint pos = threadIdx.x + blockDim.x * blockIdx.x;
    while (pos < size) {
        atomicAdd(data + color[pos], 1);
        pos += gridStride;
    }
    __syncthreads();

    // Update the global histogram.
    uint bin = threadIdx.x;
    while (bin < HISTOGRAM_BIN_COUNT) {
        atomicAdd(histogram + bin, data[bin]);
        bin += blockStride;
    }
}

/*
 * Compact block-private shared-memory histogram.
 *
 * Each block zeroes a shared sub-histogram, accumulates input bytes into it
 * with a grid-stride loop of shared atomics, then atomically adds the shared
 * bins into `histogram`.
 *
 * The zeroing and merge steps are strided by blockDim.x. The earlier form
 * used threadIdx.x directly as the bin index, which was only correct when the
 * block had exactly HISTOGRAM_BIN_COUNT threads: smaller blocks left bins
 * uninitialised and unmerged, larger blocks indexed past the arrays.
 * With HISTOGRAM_BIN_COUNT threads per block the behaviour is unchanged.
 *
 *   histogram : HISTOGRAM_BIN_COUNT counters; accumulated into, so the caller
 *               must clear it before the launch
 *   color     : input bytes
 *   size      : number of input bytes
 */
__global__ void histogram5(uint* histogram, uchar* color, int size)
{
    __shared__ uint data[HISTOGRAM_BIN_COUNT];

    // Clear every shared bin, whatever the block size is.
    for (uint bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += blockDim.x)
        data[bin] = 0;
    __syncthreads();

    for (uint i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += blockDim.x * gridDim.x)
        atomicAdd( &data[color[i]], 1);
    __syncthreads();

    // Merge every shared bin into the global histogram.
    for (uint bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += blockDim.x)
        atomicAdd( &(histogram[bin]), data[bin] );
}

/*
 * First stage of the two-stage histogram: per-block partial histograms.
 *
 * Shared memory holds WARP_COUNT sub-histograms of HISTOGRAM_BIN_COUNT bins.
 * Each thread accumulates into the sub-histogram selected by its warp index,
 * then the block sums the sub-histograms bin by bin and writes the result to
 * row blockIdx.x of `dPartialHistograms` (plain stores, no atomics, so the
 * output does not need to be cleared beforehand).
 *
 * The clear and merge loops are strided by blockDim.x and the warp index is
 * wrapped modulo WARP_COUNT. The earlier form hard-coded
 * HISTOGRAM_BLOCK_SIZE, so any other block size either left shared memory
 * partly uninitialised or indexed outside sHist. For the launch configuration
 * used in this file (HISTOGRAM_BLOCK_SIZE threads) the behaviour is unchanged.
 *
 *   dPartialHistograms : gridDim.x rows of HISTOGRAM_BIN_COUNT counters
 *   color              : input bytes
 *   size               : number of input bytes
 *
 * Shared memory: HISTOGRAM_BIN_COUNT * WARP_COUNT * sizeof(uint) bytes.
 */
__global__ void histogram4a(uint* dPartialHistograms, uchar* color, int size)
{
    __shared__ uint sHist[HISTOGRAM_BIN_COUNT * WARP_COUNT];

    // Wrap the warp index so blocks with more than WARP_COUNT warps share
    // sub-histograms instead of running off the end of sHist.
    uint *s_WarpHist = sHist
        + ((threadIdx.x >> LOG2_WARP_SIZE) % WARP_COUNT) * HISTOGRAM_BIN_COUNT;

    //Clear shared memory storage for current threadblock before processing
    for (uint i = threadIdx.x; i < HISTOGRAM_BIN_COUNT * WARP_COUNT; i += blockDim.x)
        sHist[i] = 0;

    __syncthreads();
    const uint stride = blockDim.x * gridDim.x;
    for (uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += stride)
        atomicAdd(&s_WarpHist[color[pos]], 1);

    //Merge per-warp histograms into per-block and write to global memory
    __syncthreads();
    for (uint bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += blockDim.x) {
        uint sum = 0;

        for (uint w = 0; w < WARP_COUNT; w++)
            sum += sHist[bin + w * HISTOGRAM_BIN_COUNT];

        dPartialHistograms[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum;
    }
}

//#define MERGE_BLOCK_SIZE 256
#define MERGE_BLOCK_SIZE 128
/*
 * Second stage of the two-stage histogram: merge the partial histograms.
 *
 * One block handles one bin (bin index == blockIdx.x). Its threads first sum
 * that bin across the `histogramCount` partial histograms, stepping by
 * MERGE_BLOCK_SIZE, then combine their per-thread sums with a shared-memory
 * tree reduction. Thread 0 stores the total into `histogram`, overwriting
 * whatever was there.
 *
 * Written for blockDim.x == MERGE_BLOCK_SIZE: the shared array has that many
 * slots and the reduction starts at MERGE_BLOCK_SIZE / 2.
 *
 *   histogram          : output, one counter per launched block
 *   dPartialHistograms : histogramCount rows of HISTOGRAM_BIN_COUNT counters
 *   histogramCount     : number of partial histograms to merge
 */
__global__ void histogram4b(uint* histogram, uint* dPartialHistograms, int histogramCount)
{
    __shared__ uint data[MERGE_BLOCK_SIZE];

    const uint bin  = blockIdx.x;
    const uint lane = threadIdx.x;

    // Per-thread partial sum over this thread's share of the rows.
    uint partial = 0;
    int row = lane;
    while (row < histogramCount) {
        partial += dPartialHistograms[bin + row * HISTOGRAM_BIN_COUNT];
        row += MERGE_BLOCK_SIZE;
    }
    data[lane] = partial;

    // Tree reduction; the barrier precedes each step so the previous step's
    // writes (or the initial stores above) are visible.
    uint span = MERGE_BLOCK_SIZE / 2;
    while (span > 0) {
        __syncthreads();
        if (lane < span)
            data[lane] += data[lane + span];
        span >>= 1;
    }

    if (lane == 0)
        histogram[bin] = data[0];
}


// Aborts with a diagnostic when a CUDA runtime call did not return cudaSuccess.
static void benchCheck(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Waits for outstanding GPU work, then resets and starts the timer.
static void benchStartTimer(cudaStopWatchInterface& timer)
{
    benchCheck(cutilDeviceSynchronize(), "synchronize before timing");
    cutilCheckError( cutResetTimer(timer) );
    cutilCheckError( cutStartTimer(timer) );
}

// Surfaces launch/execution errors, waits for the GPU, stops the timer and
// prints the cumulative time for the kernel called `name`.
static void benchStopTimer(cudaStopWatchInterface& timer, const char* name)
{
    benchCheck(cudaGetLastError(), name);
    benchCheck(cutilDeviceSynchronize(), name);
    cutilCheckError( cutStopTimer(timer) );
    double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(timer);
    printf("%s() time (cumulative) : %.5f sec\n", name, dAvgSecs);
}

// Copies the device histogram to `hDst` and prints all bins on one line.
static void benchFetchAndPrint(uint* hDst, const uint* dSrc)
{
    benchCheck(cudaMemcpy(hDst, dSrc, HISTOGRAM_BIN_COUNT * sizeof(uint),
                          cudaMemcpyDeviceToHost), "histogram download");
    for (int i = 0; i < HISTOGRAM_BIN_COUNT; ++i) printf("%u ", hDst[i]);
    printf("\n");
}

// Compares two host histograms bin by bin and prints "ERRORS!" on the first
// mismatch, otherwise `okMessage`.
static void benchReportMatch(const uint* candidate, const uint* reference, const char* okMessage)
{
    int errors = 0;
    for (int i = 0; i < HISTOGRAM_BIN_COUNT; ++i)
        if (candidate[i] != reference[i]) { errors = 1; break; }
    if (errors == 1) printf("ERRORS!\n"); else printf("%s\n", okMessage);
}

/*
 * Benchmark driver.
 *
 * Generates N pseudo-random bytes (fixed seed), then for each kernel variant
 * runs one warm-up launch followed by NumRuns timed launches, prints the
 * cumulative time and the resulting histogram, and compares the result with
 * the histogram1 output, which serves as the reference.
 *
 * Only kernel work is timed; device-to-host copies happen after the timer is
 * stopped for every variant.
 */
int main() {
    uchar* hColor = (uchar*)malloc(N * sizeof(uchar));
    uint* hHistogram1 = (uint*)malloc(HISTOGRAM_BIN_COUNT * sizeof(uint));
    uint* hHistogram2 = (uint*)malloc(HISTOGRAM_BIN_COUNT * sizeof(uint));
    uint* hHistogram3 = (uint*)malloc(HISTOGRAM_BIN_COUNT * sizeof(uint));
    uint* hHistogram4 = (uint*)malloc(HISTOGRAM_BIN_COUNT * sizeof(uint));
    uint* hHistogram5 = (uint*)malloc(HISTOGRAM_BIN_COUNT * sizeof(uint));
    if (!hColor || !hHistogram1 || !hHistogram2 || !hHistogram3 ||
        !hHistogram4 || !hHistogram5) {
        fprintf(stderr, "Host memory allocation failed.\n");
        // free(NULL) is a no-op, so releasing everything is safe here.
        free(hColor);
        free(hHistogram1);
        free(hHistogram2);
        free(hHistogram3);
        free(hHistogram4);
        free(hHistogram5);
        return EXIT_FAILURE;
    }

    cudaStopWatchInterface hTimer = 0;
    dim3 threads, blocks, threads2, blocks2;
    cutilCheckError(cutCreateTimer(&hTimer));

    // Fixed seed keeps the input (and therefore the printed histograms) reproducible.
    srand(2017);
    for (uint i = 0; i < N; ++i) hColor[i] = (uchar)(rand() % 256);

    uchar* dColor = 0;
    uint* dPartialHistograms = 0;
    uint* dHistogram = 0;
    benchCheck(cudaMalloc(&dColor, N * sizeof(uchar)), "cudaMalloc(dColor)");
    benchCheck(cudaMemcpy(dColor, hColor, N * sizeof(uchar), cudaMemcpyHostToDevice),
               "input upload");
    benchCheck(cudaMalloc(&dPartialHistograms, PHCOUNT * HISTOGRAM_BIN_COUNT * sizeof(uint)),
               "cudaMalloc(dPartialHistograms)");
    benchCheck(cudaMalloc(&dHistogram, HISTOGRAM_BIN_COUNT * sizeof(uint)),
               "cudaMalloc(dHistogram)");

    // USING GLOBAL MEMORY (reference result)
    threads.x = 32;
    blocks.x = (N + threads.x - 1) / threads.x;
    printf("threads=%u  blocks=%u\n",threads.x,blocks.x);
    for (int iter = -1; iter < NumRuns; iter++) {
        //iter == -1 -- warmup iteration
        if (iter == 0) benchStartTimer(hTimer);
        benchCheck(cudaMemset(dHistogram, 0, HISTOGRAM_BIN_COUNT * sizeof(uint)), "cudaMemset");
        histogram1<<<blocks,threads>>>(dHistogram, dColor, N);
    }
    benchStopTimer(hTimer, "histogram1");
    benchFetchAndPrint(hHistogram1, dHistogram);

    // USING SHARED MEMORY (two-stage: per-block partials, then merge)
    threads.x = HISTOGRAM_BLOCK_SIZE;
    blocks.x = PHCOUNT;
    threads2.x = MERGE_BLOCK_SIZE;
    blocks2.x = HISTOGRAM_BIN_COUNT;
    printf("threads=%u  blocks=%u\n",threads.x,blocks.x);
    for (int iter = -1; iter < NumRuns; iter++) {
        //iter == -1 -- warmup iteration
        if (iter == 0) benchStartTimer(hTimer);
        // No memset needed: both stages overwrite their outputs.
        histogram4a<<<blocks,threads>>>(dPartialHistograms, dColor, N);
        histogram4b<<<blocks2,threads2>>>(dHistogram, dPartialHistograms, PHCOUNT);
    }
    benchStopTimer(hTimer, "histogram4");
    benchFetchAndPrint(hHistogram4, dHistogram);
    benchReportMatch(hHistogram4, hHistogram1, "Goodbye4!");

    // USING SHARED MEMORY v5
    threads.x = HISTOGRAM_BIN_COUNT;
    blocks.x = (N + threads.x - 1) / threads.x;
    printf("threads=%u  blocks=%u\n",threads.x,blocks.x);
    for (int iter = -1; iter < NumRuns; iter++) {
        //iter == -1 -- warmup iteration
        if (iter == 0) benchStartTimer(hTimer);
        benchCheck(cudaMemset(dHistogram, 0, HISTOGRAM_BIN_COUNT * sizeof(uint)), "cudaMemset");
        histogram5<<<blocks,threads>>>(dHistogram, dColor, N);
    }
    benchStopTimer(hTimer, "histogram5");
    benchFetchAndPrint(hHistogram5, dHistogram);
    benchReportMatch(hHistogram5, hHistogram1, "Goodbye5!");

    // USING SHARED MEMORY v3
    threads.x = 64;
    blocks.x = (N + threads.x - 1) / threads.x;
    printf("threads=%u  blocks=%u N=%d\n",threads.x,blocks.x,N);
    for (int iter = -1; iter < NumRuns; iter++) {
        //iter == -1 -- warmup iteration
        if (iter == 0) benchStartTimer(hTimer);
        benchCheck(cudaMemset(dHistogram, 0, HISTOGRAM_BIN_COUNT * sizeof(uint)), "cudaMemset");
        histogram3<<<blocks,threads>>>(dHistogram, dColor, N);
    }
    benchStopTimer(hTimer, "histogram3");
    benchFetchAndPrint(hHistogram3, dHistogram);
    benchReportMatch(hHistogram3, hHistogram1, "Goodbye3!");

    // USING SHARED MEMORY v2
    threads.x = MERGE_THREADBLOCK_SIZE;
    blocks.x = (N + threads.x - 1) / threads.x;
    printf("threads=%u  blocks=%u N=%d\n",threads.x,blocks.x,N);
    for (int iter = -1; iter < NumRuns; iter++) {
        //iter == -1 -- warmup iteration
        if (iter == 0) benchStartTimer(hTimer);
        benchCheck(cudaMemset(dHistogram, 0, HISTOGRAM_BIN_COUNT * sizeof(uint)), "cudaMemset");
        histogram2<<<blocks,threads>>>(dHistogram, dColor, N);
    }
    benchStopTimer(hTimer, "histogram2");
    benchFetchAndPrint(hHistogram2, dHistogram);
    benchReportMatch(hHistogram2, hHistogram1, "Goodbye2!");

    benchCheck(cudaFree(dColor), "cudaFree(dColor)");
    benchCheck(cudaFree(dPartialHistograms), "cudaFree(dPartialHistograms)");
    benchCheck(cudaFree(dHistogram), "cudaFree(dHistogram)");
    free(hColor);
    free(hHistogram1);
    free(hHistogram2);
    free(hHistogram3);
    free(hHistogram4);
    free(hHistogram5);
    return 0;
}
