/* Programmer: Mark Fienup
   File: count3s.cu
   Compile As: nvcc -o count3s_sm_13 -arch=sm_13 count3s_reduction.cu
   Note: -arch=sm_11 or later is needed for atomicAdd on global memory;
         kernels B and C use shared-memory atomics, which require
         compute capability 1.2+.  Run on CUDA 1.3 compute-capability devices.
   Description: A CUDA solution to the Count 3s problem. */

#define SIZE (512*512*32)
#define threadsPerBlock 512

/* FIX(review): the original #include directives had lost their header names.
   stdio.h  -> printf
   stdlib.h -> exit, malloc, free, srand, rand
   cuda_runtime.h -> cudaError_t, cudaGetErrorString, events, memcpy, etc.
   (nvcc includes cuda_runtime.h implicitly, but the explicit include keeps
   the file self-describing and editor-friendly.) */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Prints the CUDA error string with the failing file/line and aborts.
   Always invoked through HANDLE_ERROR so each call site reports its
   own location via __FILE__/__LINE__. */
static void HandleError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

/* Kernel A: baseline.  Each thread walks the array with a grid-stride loop
   and performs one atomicAdd on the single global counter per 3 found.
   All threads contend for the same address, which serializes the atomics —
   this is the slowest variant by design; B and C reduce that contention.
   Requires sm_11+ for global-memory atomicAdd.
   Timings (original author's machine):
     20.3 ms on Tesla C2070 on fermi1 (device #2)  (29.6 ms serially on host)
    281.1 ms on Tesla C1060 on fermi1 (device #1)  (29.6 ms serially on host) */
__global__ void count3s_kernelA(int * dev_array, int length, int * devCount)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;   // total threads in the grid
    while (i < length) {
        if (dev_array[i] == 3) {
            atomicAdd(devCount, 1);        // every hit hammers one address
        } // end if
        i = i + stride;
    } // end while
} // end count3s_kernelA
/* Kernel B: reduces contention on devCount by accumulating into a per-block
   counter in shared memory, then having thread 0 of each block fold that
   block total into the global counter with a single atomicAdd.
   Shared-memory atomics require compute capability 1.2+.
   Timings (original author's machine):
     10.4 ms on Tesla C2070 on fermi1 (device #2)  (29.6 ms serially on host)
      9.8 ms on Tesla C1060 on fermi1 (device #1)  (29.6 ms serially on host) */
__global__ void count3s_kernelB(int * dev_array, int length, int * devCount)
{
    __shared__ int blockCount;   // per-block tally; uninitialized until thread 0 zeroes it

    if (threadIdx.x == 0) {
        blockCount = 0;
    } // end if
    __syncthreads();             // all threads must see blockCount == 0 before counting

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while (i < length) {
        if (dev_array[i] == 3) {
            atomicAdd(&blockCount, 1);   // contention limited to one block's threads
        } // end if
        i = i + offset;
    } // end while

    __syncthreads();             // block total complete before thread 0 publishes it
    if (threadIdx.x == 0) {
        atomicAdd(devCount, blockCount); // one global atomic per block
    } // end if
} // end count3s_kernelB

/* Kernel C: removes shared-memory atomic contention as well.  Each thread
   keeps a private register count, stores it into shared memory, and the block
   combines the threadsPerBlock partial counts with a binary-tree reduction.
   Thread 0 then adds the block total to the global counter — one global
   atomic per block, zero atomics elsewhere.
   Preconditions: blockDim.x == threadsPerBlock, and threadsPerBlock must be
   a power of 2 (the halving reduction assumes it).  Requires CC 1.2+.
   Timings (original author's machine):
     8.3 ms on Tesla C2070 on fermi1 (device #2)  (29.6 ms serially on host)
     7.1 ms on Tesla C1060 on fermi1 (device #1)  (29.6 ms serially on host) */
__global__ void count3s_kernelC(int * dev_array, int length, int * devCount)
{
    __shared__ int threadCounts[threadsPerBlock];

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    int threadCount = 0;                 // private per-thread tally (register)
    while (i < length) {
        if (dev_array[i] == 3) {
            threadCount += 1;
        } // end if
        i = i + offset;
    } // end while
    threadCounts[threadIdx.x] = threadCount;
    __syncthreads();                     // all partials stored before reducing

    // binary-tree reduction; threadsPerBlock must be a power of 2
    i = blockDim.x / 2;
    while (i != 0) {
        if (threadIdx.x < i) {
            threadCounts[threadIdx.x] += threadCounts[threadIdx.x + i];
        } // end if
        __syncthreads();                 // barrier outside the divergent if — all threads reach it
        i = i / 2;
    } // end while

    if (threadIdx.x == 0) {
        atomicAdd(devCount, threadCounts[0]);   // block total -> global counter
    } // end if
} // end count3s_kernelC

/* Driver: builds a SIZE-element array with ~10% 3s (rand()%10, fixed seed),
   counts them sequentially on the host, then on the GPU with kernel A
   (swap in kernelB/kernelC by hand to compare), and checks the two counts
   agree.  Both phases are timed with CUDA events.  Device #2 (the fermi1
   Tesla C2070) is hard-coded; HANDLE_ERROR aborts cleanly on machines
   without that device. */
int main(int argc, char* argv[])
{
    int sequentialCount, i, length;
    int * myArray;

    // fermi1 device #1 is Tesla C1060, 1.3 compute capability:
    // HANDLE_ERROR(cudaGetDeviceProperties( &prop, 1));
    // HANDLE_ERROR(cudaSetDevice(1));
    // fermi1 device #2 is Tesla C2070, 2.0 compute capability:
    cudaDeviceProp prop;   // FIX: declaration was commented out, but prop is used below
    HANDLE_ERROR(cudaGetDeviceProperties( &prop, 2));
    HANDLE_ERROR(cudaSetDevice(2));

    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // Generate data array with 10% 3s (fixed seed for reproducibility)
    length = SIZE;
    printf("length = %d\n", length);
    myArray = (int *) malloc(length*sizeof(int));
    srand(5);
    for (i = 0; i < length; i++) {
        myArray[i] = rand() % 10;
    } // end for i

    /* Do the actual work sequentially (reference answer + host timing) */
    cudaEventRecord(start, 0);
    sequentialCount = 0;
    for (i = 0; i < length; i++) {
        if (myArray[i] == 3) {
            sequentialCount++;
        } // end if
    } // end for i
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf( "Time to count 3s on host: %3.1f ms\n", elapsedTime);
    printf("Number of 3's: %d\n", sequentialCount);

    // Do the work on GPU (timing includes H2D copy and D2H result copy)
    cudaEventRecord(start, 0);
    int * dev_array;
    int * dev_count;
    // FIX: check every runtime call with HANDLE_ERROR instead of ignoring
    // the returned cudaError_t — the file defines the macro for this purpose.
    HANDLE_ERROR(cudaMalloc((void**) &dev_array, length*sizeof(int)));
    HANDLE_ERROR(cudaMemcpy(dev_array, myArray, sizeof(int)*length,
                            cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMalloc((void**) &dev_count, sizeof(int)));
    HANDLE_ERROR(cudaMemset(dev_count, 0, sizeof(int)));

    // One block per multiprocessor; the grid-stride loop in the kernel
    // handles any grid size correctly.
    int blocks = prop.multiProcessorCount;
    // FIX: the launch configuration had been garbled to "<<>>"; restore the
    // <<<grid, block>>> execution configuration.
    count3s_kernelA<<<blocks, threadsPerBlock>>>(dev_array, length, dev_count);
    HANDLE_ERROR(cudaGetLastError());   // catch launch-configuration errors

    int devCount;
    HANDLE_ERROR(cudaMemcpy(&devCount, dev_count, sizeof(int),
                            cudaMemcpyDeviceToHost));  // blocking copy syncs the kernel
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf( "Time to count 3s on CUDA device: %3.1f ms\n", elapsedTime);

    if (sequentialCount == devCount) {
        printf("Results match at %d 3s!\n", devCount);
    } else {
        printf("Results wrong with seq. count %d and GPU count %d.\n",
               sequentialCount, devCount);
    } // end if

    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    cudaFree( dev_count );
    cudaFree( dev_array );
    free(myArray);
    return 0;
} /* end main */