/* Programmer: Mark Fienup
   File: count3s.cu
   Compile As: nvcc -o count3s_sm_13 -arch=sm_13 count3s_reduction.cu
   Note: -arch=sm_11 or later is needed for atomicAdd on global memory;
         kernels B and C use shared-memory atomics, which require
         compute capability 1.2+.  Run on CUDA 1.3 compute-capability devices.
   Description: A CUDA solution to the Count 3s problem. */

#define SIZE (512*512*32)
#define threadsPerBlock 512

/* FIX(review): the original #include directives had lost their header names.
   stdio.h  -> printf
   stdlib.h -> exit, malloc, free, srand, rand
   cuda_runtime.h -> cudaError_t, cudaGetErrorString, events, memcpy, etc.
   (nvcc includes cuda_runtime.h implicitly, but the explicit include keeps
   the file self-describing and editor-friendly.) */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Prints the CUDA error string with the failing file/line and aborts.
   Always invoked through HANDLE_ERROR so each call site reports its
   own location via __FILE__/__LINE__. */
static void HandleError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

/* Kernel A: baseline.  Each thread walks the array with a grid-stride loop
   and performs one atomicAdd on the single global counter per 3 found.
   All threads contend for the same address, which serializes the atomics —
   this is the slowest variant by design; B and C reduce that contention.
   Requires sm_11+ for global-memory atomicAdd.
   Timings (original author's machine):
     20.3 ms on Tesla C2070 on fermi1 (device #2)  (29.6 ms serially on host)
    281.1 ms on Tesla C1060 on fermi1 (device #1)  (29.6 ms serially on host) */
__global__ void count3s_kernelA(int * dev_array, int length, int * devCount)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;   // total threads in the grid
    while (i < length) {
        if (dev_array[i] == 3) {
            atomicAdd(devCount, 1);        // every hit hammers one address
        } // end if
        i = i + stride;
    } // end while
} // end count3s_kernelA
/* Kernel B: reduces contention on devCount by accumulating into a per-block
   counter in shared memory, then having thread 0 of each block fold that
   block total into the global counter with a single atomicAdd.
   Shared-memory atomics require compute capability 1.2+.
   Timings (original author's machine):
     10.4 ms on Tesla C2070 on fermi1 (device #2)  (29.6 ms serially on host)
      9.8 ms on Tesla C1060 on fermi1 (device #1)  (29.6 ms serially on host) */
__global__ void count3s_kernelB(int * dev_array, int length, int * devCount)
{
    __shared__ int blockCount;   // per-block tally; uninitialized until thread 0 zeroes it

    if (threadIdx.x == 0) {
        blockCount = 0;
    } // end if
    __syncthreads();             // all threads must see blockCount == 0 before counting

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while (i < length) {
        if (dev_array[i] == 3) {
            atomicAdd(&blockCount, 1);   // contention limited to one block's threads
        } // end if
        i = i + offset;
    } // end while

    __syncthreads();             // block total complete before thread 0 publishes it
    if (threadIdx.x == 0) {
        atomicAdd(devCount, blockCount); // one global atomic per block
    } // end if
} // end count3s_kernelB

/* Kernel C: removes shared-memory atomic contention as well.  Each thread
   keeps a private register count, stores it into shared memory, and the block
   combines the threadsPerBlock partial counts with a binary-tree reduction.
   Thread 0 then adds the block total to the global counter — one global
   atomic per block, zero atomics elsewhere.
   Preconditions: blockDim.x == threadsPerBlock, and threadsPerBlock must be
   a power of 2 (the halving reduction assumes it).  Requires CC 1.2+.
   Timings (original author's machine):
     8.3 ms on Tesla C2070 on fermi1 (device #2)  (29.6 ms serially on host)
     7.1 ms on Tesla C1060 on fermi1 (device #1)  (29.6 ms serially on host) */
__global__ void count3s_kernelC(int * dev_array, int length, int * devCount)
{
    __shared__ int threadCounts[threadsPerBlock];

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    int threadCount = 0;                 // private per-thread tally (register)
    while (i < length) {
        if (dev_array[i] == 3) {
            threadCount += 1;
        } // end if
        i = i + offset;
    } // end while
    threadCounts[threadIdx.x] = threadCount;
    __syncthreads();                     // all partials stored before reducing

    // binary-tree reduction; threadsPerBlock must be a power of 2
    i = blockDim.x / 2;
    while (i != 0) {
        if (threadIdx.x < i) {
            threadCounts[threadIdx.x] += threadCounts[threadIdx.x + i];
        } // end if
        __syncthreads();                 // barrier outside the divergent if — all threads reach it
        i = i / 2;
    } // end while

    if (threadIdx.x == 0) {
        atomicAdd(devCount, threadCounts[0]);   // block total -> global counter
    } // end if
} // end count3s_kernelC

/* Driver: builds a SIZE-element array with ~10% 3s (rand()%10, fixed seed),
   counts them sequentially on the host, then on the GPU with kernel A
   (swap in kernelB/kernelC by hand to compare), and checks the two counts
   agree.  Both phases are timed with CUDA events.  Device #2 (the fermi1
   Tesla C2070) is hard-coded; HANDLE_ERROR aborts cleanly on machines
   without that device. */
int main(int argc, char* argv[])
{
    int sequentialCount, i, length;
    int * myArray;

    // fermi1 device #1 is Tesla C1060, 1.3 compute capability:
    // HANDLE_ERROR(cudaGetDeviceProperties( &prop, 1));
    // HANDLE_ERROR(cudaSetDevice(1));
    // fermi1 device #2 is Tesla C2070, 2.0 compute capability:
    cudaDeviceProp prop;   // FIX: declaration was commented out, but prop is used below
    HANDLE_ERROR(cudaGetDeviceProperties( &prop, 2));
    HANDLE_ERROR(cudaSetDevice(2));

    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // Generate data array with 10% 3s (fixed seed for reproducibility)
    length = SIZE;
    printf("length = %d\n", length);
    myArray = (int *) malloc(length*sizeof(int));
    srand(5);
    for (i = 0; i < length; i++) {
        myArray[i] = rand() % 10;
    } // end for i

    /* Do the actual work sequentially (reference answer + host timing) */
    cudaEventRecord(start, 0);
    sequentialCount = 0;
    for (i = 0; i < length; i++) {
        if (myArray[i] == 3) {
            sequentialCount++;
        } // end if
    } // end for i
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf( "Time to count 3s on host: %3.1f ms\n", elapsedTime);
    printf("Number of 3's: %d\n", sequentialCount);

    // Do the work on GPU (timing includes H2D copy and D2H result copy)
    cudaEventRecord(start, 0);
    int * dev_array;
    int * dev_count;
    // FIX: check every runtime call with HANDLE_ERROR instead of ignoring
    // the returned cudaError_t — the file defines the macro for this purpose.
    HANDLE_ERROR(cudaMalloc((void**) &dev_array, length*sizeof(int)));
    HANDLE_ERROR(cudaMemcpy(dev_array, myArray, sizeof(int)*length,
                            cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMalloc((void**) &dev_count, sizeof(int)));
    HANDLE_ERROR(cudaMemset(dev_count, 0, sizeof(int)));

    // One block per multiprocessor; the grid-stride loop in the kernel
    // handles any grid size correctly.
    int blocks = prop.multiProcessorCount;
    // FIX: the launch configuration had been garbled to "<<>>"; restore the
    // <<<grid, block>>> execution configuration.
    count3s_kernelA<<<blocks, threadsPerBlock>>>(dev_array, length, dev_count);
    HANDLE_ERROR(cudaGetLastError());   // catch launch-configuration errors

    int devCount;
    HANDLE_ERROR(cudaMemcpy(&devCount, dev_count, sizeof(int),
                            cudaMemcpyDeviceToHost));  // blocking copy syncs the kernel
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf( "Time to count 3s on CUDA device: %3.1f ms\n", elapsedTime);

    if (sequentialCount == devCount) {
        printf("Results match at %d 3s!\n", devCount);
    } else {
        printf("Results wrong with seq. count %d and GPU count %d.\n",
               sequentialCount, devCount);
    } // end if

    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    cudaFree( dev_count );
    cudaFree( dev_array );
    free(myArray);
    return 0;
} /* end main */