////////////////////////////////////////////////////////////////////////////
// Estimate PI on GPU
////////////////////////////////////////////////////////////////////////////

// Shared memory used as per-block scratch space for the reduction.
// Declared `extern` with no size, so the byte count is supplied at kernel
// launch. piEstimateGPU indexes it with threadIdx.x, so the launch must
// provide at least blockDim.x * sizeof(unsigned int) bytes.
extern __shared__ unsigned int s_data[];

// Counts, for each thread block, how many sample points (x[i], y[i]) satisfy
// x*x + y*y < 1, and writes that count to counters[blockIdx.x].
//
// Launch layout: only the .x components of threadIdx/blockIdx/blockDim are
// used, so a 1D grid of 1D blocks is expected. Any block size is supported
// (it does not have to be a power of two).
// Shared memory: the launch must provide at least
// blockDim.x * sizeof(unsigned int) bytes of dynamic shared memory (s_data).
//
// Parameters:
//   x, y     - sample coordinates; elements [0, N) are read
//   counters - output; element blockIdx.x is written by each block, so it
//              needs one entry per launched block
//   N        - number of samples; threads whose global index is >= N
//              contribute 0 to their block's count
__global__ void piEstimateGPU(float *x,float *y, unsigned int *counters, int N)
{
	// global index of the sample handled by this thread
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	// index within block
	unsigned int tid = threadIdx.x;

	// 1 if this thread's sample lies strictly inside the unit circle, else 0.
	// Computed in a register so shared memory is written exactly once.
	unsigned int hit = 0;
	if (idx < N)
	{
		float xx = x[idx];
		float yy = y[idx];
		float norm = xx*xx + yy*yy;
		if (norm < 1.0f)
			hit = 1;
	}
	s_data[tid] = hit;

	// every thread's flag must be visible before the reduction reads it
	__syncthreads();

	// Tree reduction in shared memory. `active` is the number of leading
	// entries that still hold partial sums. Each step folds the upper
	// floor(active/2) entries onto the lower ones using a ceil(active/2)
	// offset; when `active` is odd, the middle entry is left in place and
	// picked up by a later step. For power-of-two block sizes this is the
	// usual halving scheme.
	for (unsigned int active = blockDim.x; active > 1; )
	{
		unsigned int half = (active + 1) / 2;
		if (tid < active / 2)
		{
			s_data[tid] += s_data[tid + half];
		}
		// barrier is outside the branch so all threads in the block reach it
		__syncthreads();
		active = half;
	}

	// write result for this block to global mem
	if (tid == 0)
		counters[blockIdx.x] = s_data[0];
}
