////////////////////////////////////////////////////////////////////////////
// Calculate dot products on GPU
////////////////////////////////////////////////////////////////////////////

// Element-wise product stage: each thread writes c[i] = a[i] * b[i] for its
// own global index i. The kernel itself performs no summation.
//
//   c : output, at least N floats
//   a : first input, at least N floats
//   b : second input, at least N floats
//   N : number of elements to process
//
// Only the x dimension of the grid and block is used for indexing. Threads
// whose index falls at or beyond N do nothing.
__global__ void dotProdGPU0(float *c, float *a,float *b, int N)
{
	// flat position of this thread across the whole grid
	const int gid = blockIdx.x * blockDim.x + threadIdx.x;

	// surplus threads in the last block have no element to handle
	if (gid >= N)
		return;

	c[gid] = a[gid] * b[gid];
}

// Per-block partial dot product, interleaved addressing selected with a
// modulo test.
//
// Each thread multiplies one pair a[idx] * b[idx] into shared memory, then the
// block sums those products in a tree; thread 0 writes the block's partial sum
// to c[blockIdx.x]. The per-block partial sums still have to be added together
// by the caller to obtain the full dot product.
//
//   c : output, one float per block (at least gridDim.x floats)
//   a : first input, at least N floats
//   b : second input, at least N floats
//   N : number of elements to process
//
// Launch requirements: only the x dimension is used for indexing, and the
// dynamic shared memory size must be at least blockDim.x * sizeof(float).
// Any block size is accepted; it does not have to be a power of two.
__global__ void dotProdGPU1(float *c,float *a,float *b, int N)
{
	// global index
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	// index within block
	int tid = threadIdx.x;
	// shared memory for reduction
	extern __shared__ float s_data[];

	// perform computation; threads past the end of the input contribute 0 so
	// the reduction never reads an uninitialised shared-memory slot
	s_data[tid] = (idx < N) ? a[idx] * b[idx] : 0.0f;

	__syncthreads();

	// do reduction in shared mem
	for (unsigned int s = 1; s < blockDim.x; s *= 2)
	{
		// the second condition keeps the partner slot inside the block when
		// blockDim.x is not a power of two
		if ((tid % (2*s)) == 0 && tid + s < blockDim.x)
		{
			s_data[tid] += s_data[tid + s];
		}
		// barrier sits outside the branch so every thread reaches it
		__syncthreads();
	}

	// write result for this block to global mem
	if (tid == 0)
		c[blockIdx.x] = s_data[0];
}

// Per-block partial dot product, interleaved addressing with a strided index
// (consecutive threads handle consecutive pairs instead of using a modulo
// test to pick the active threads).
//
// Each thread multiplies one pair a[idx] * b[idx] into shared memory, then the
// block sums those products in a tree; thread 0 writes the block's partial sum
// to c[blockIdx.x]. The per-block partial sums still have to be added together
// by the caller to obtain the full dot product.
//
//   c : output, one float per block (at least gridDim.x floats)
//   a : first input, at least N floats
//   b : second input, at least N floats
//   N : number of elements to process
//
// Launch requirements: only the x dimension is used for indexing, and the
// dynamic shared memory size must be at least blockDim.x * sizeof(float).
// Any block size is accepted; it does not have to be a power of two.
__global__ void dotProdGPU2(float *c,float *a,float *b, int N)
{
	// global index
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	// index within block
	int tid = threadIdx.x;
	// shared memory for reduction
	extern __shared__ float s_data[];

	// perform computation; threads past the end of the input contribute 0 so
	// the reduction never reads an uninitialised shared-memory slot
	s_data[tid] = (idx < N) ? a[idx] * b[idx] : 0.0f;

	__syncthreads();

	// do reduction in shared mem
	for (unsigned int s = 1; s < blockDim.x; s *= 2)
	{
		// slot this thread accumulates into during the current pass
		unsigned int index = 2 * s * tid;

		// testing the partner slot (index + s) also bounds index itself and
		// keeps the read inside the block for non-power-of-two block sizes
		if (index + s < blockDim.x)
		{
			s_data[index] += s_data[index + s];
		}
		// barrier sits outside the branch so every thread reaches it
		__syncthreads();
	}

	// write result for this block to global mem
	if (tid == 0)
		c[blockIdx.x] = s_data[0];
}

// Per-block partial dot product, sequential addressing (the lower part of the
// block adds in the upper part, with the stride halving on every pass).
//
// Each thread multiplies one pair a[idx] * b[idx] into shared memory, then the
// block sums those products in a tree; thread 0 writes the block's partial sum
// to c[blockIdx.x]. The per-block partial sums still have to be added together
// by the caller to obtain the full dot product.
//
//   c : output, one float per block (at least gridDim.x floats)
//   a : first input, at least N floats
//   b : second input, at least N floats
//   N : number of elements to process
//
// Launch requirements: only the x dimension is used for indexing, and the
// dynamic shared memory size must be at least blockDim.x * sizeof(float).
// Any block size is accepted; it does not have to be a power of two.
__global__ void dotProdGPU3(float *c,float *a,float *b, int N)
{
	// global index
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	// index within block
	int tid = threadIdx.x;
	// shared memory for reduction
	extern __shared__ float s_data[];

	// perform computation; threads past the end of the input contribute 0 so
	// the reduction never reads an uninitialised shared-memory slot
	s_data[tid] = (idx < N) ? a[idx] * b[idx] : 0.0f;

	__syncthreads();

	// smallest power of two >= blockDim.x; starting the stride from half of
	// this value (rather than blockDim.x/2) ensures no element is skipped when
	// the block size is not a power of two. For power-of-two block sizes the
	// first stride is exactly blockDim.x/2.
	unsigned int span = 1;
	while (span < blockDim.x)
		span <<= 1;

	// do reduction in shared mem
	for (unsigned int s = span >> 1; s > 0; s >>= 1)
	{
		// the second condition skips partners that lie beyond the block
		if (tid < s && tid + s < blockDim.x)
		{
			s_data[tid] += s_data[tid + s];
		}
		// barrier sits outside the branch so every thread reaches it
		__syncthreads();
	}

	// write result for this block to global mem
	if (tid == 0)
		c[blockIdx.x] = s_data[0];
}
