ex2b_a.cu (3085B)
/*
 * ex2b_a.cu -- apply a 3x3 convolution to an NxN matrix on the GPU, then
 * find the minimum element on the diagonal of the result.
 *
 * Uses CUDA unified memory so the same pointers are valid on both host
 * and device, avoiding explicit cudaMemcpy calls:
 *
 *   https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N (1 << 2)
#define DIM (N * N)
/*
 * Ceiling-division formula for calculating the number of blocks,
 * mentioned at the "out of the blocks" section in:
 *
 * https://developer.nvidia.com/blog/even-easier-introduction-cuda/
 */
#define BLKSIZE (1 << 8)
#define NBLK ((DIM + BLKSIZE - 1) / BLKSIZE)

/*
 * Every CUDA runtime call returns a cudaError_t; kernel launches report
 * configuration errors only through cudaGetLastError() and execution
 * errors at the next synchronizing call.  Check them all.
 */
#define CUDA_CHECK(call) do {						\
	cudaError_t err_ = (call);					\
	if (err_ != cudaSuccess) {					\
		fprintf(stderr, "CUDA error %s:%d: %s\n",		\
		    __FILE__, __LINE__, cudaGetErrorString(err_));	\
		exit(1);						\
	}								\
} while (0)

/*
 * Convolve a with a fixed 3x3 kernel, writing the result into aconv.
 * Only interior cells (row and column in [1, N-2]) are written; the
 * caller is responsible for initializing the border of aconv.
 *
 * Launch layout: 1-D grid, one row of the matrix per loop iteration,
 * distributed over all threads with a grid-stride loop so any
 * <<<blocks, threads>>> configuration (including <<<1, 1>>>) produces
 * the full result:
 *
 * https://developer.nvidia.com/blog/even-easier-introduction-cuda/
 */
__global__ void
convolution(float *a, float *aconv)
{
	float c11, c12, c13, c21, c22, c23, c31, c32, c33;
	int i, j, x, stridex;

	x = blockIdx.x * blockDim.x + threadIdx.x;
	stridex = blockDim.x * gridDim.x;

	/* Random weight values (float literals; avoid double math) */
	c11 = +0.2f; c21 = +0.5f; c31 = -0.8f;
	c12 = -0.3f; c22 = +0.6f; c32 = -0.9f;
	c13 = +0.4f; c23 = +0.7f; c33 = +0.10f;

	/*
	 * Start at x + 1 so the first interior row is covered by thread 0;
	 * an early "if (x < 1) return" here would silently skip work when
	 * the grid has fewer threads than rows.
	 */
	for (i = x + 1; i < N - 1; i += stridex) {
		for (j = 1; j < N - 1; j++) {
			/* Taken from the lab's example code. */
			aconv[i * N + j] =
			    c11 * a[(i - 1) * N + (j - 1)] +
			    c12 * a[i * N + (j - 1)] +
			    c13 * a[(i + 1) * N + (j - 1)] +
			    c21 * a[(i - 1) * N + j] +
			    c22 * a[i * N + j] +
			    c23 * a[(i + 1) * N + j] +
			    c31 * a[(i - 1) * N + (j + 1)] +
			    c32 * a[i * N + (j + 1)] +
			    c33 * a[(i + 1) * N + (j + 1)];
		}
	}
}

/*
 * Compute per-thread partial minimums of the diagonal of arr.
 * Thread x (x < N) reduces elements x, x + stride, x + 2*stride, ...
 * into min_arr[x]; threads x >= N write nothing.  The host combines
 * the first min(N, total threads) entries.  NBLK * BLKSIZE >= DIM >= N,
 * so with the launch in main() every diagonal element is covered.
 */
__global__ void
min_diagonal(float *arr, float *min_arr)
{
	int x, stridex, i;
	float m;

	x = blockIdx.x * blockDim.x + threadIdx.x;
	stridex = blockDim.x * gridDim.x;

	if (x >= N)
		return;
	/* Accumulate in a register; one global store at the end. */
	m = arr[x * N + x];
	for (i = x + stridex; i < N; i += stridex)
		m = fminf(m, arr[i * N + i]);
	min_arr[x] = m;
}

/*
 * Print an NxN matrix stored in row-major order, labelled with name.
 */
static void
pretty_print(float *arr, const char *name)
{
	int i, j;

	printf("\n%s = [\n", name);
	for (i = 0; i < N; i++) {
		printf("\t[");
		for (j = 0; j < N; j++) {
			printf("%.2f%s", arr[i * N + j],
			    (j == N - 1) ? "]\n" : ", ");
		}
	}
	printf("]\n");
}

int
main(int argc, char *argv[])
{
	float *a, *aconv, *min_arr, min;
	int i;

	srand(time(NULL));

	/*
	 * Use unified memory to avoid having additional device arrays and
	 * memcpying from host to device and vice versa.
	 *
	 * https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
	 */
	CUDA_CHECK(cudaMallocManaged(&a, DIM * sizeof(float)));
	CUDA_CHECK(cudaMallocManaged(&aconv, DIM * sizeof(float)));
	/* Only one partial minimum per diagonal element is ever stored. */
	CUDA_CHECK(cudaMallocManaged(&min_arr, N * sizeof(float)));

	/* Initialize array */
	for (i = 0; i < DIM; i++)
		a[i] = (float)(rand() % 100);

	/*
	 * The convolution kernel writes only the interior cells of aconv.
	 * Zero the whole array first so the border cells hold defined
	 * values when they are printed and reduced below (previously they
	 * were uninitialized memory).
	 */
	CUDA_CHECK(cudaMemset(aconv, 0, DIM * sizeof(float)));

	convolution<<<NBLK, BLKSIZE>>>(a, aconv);
	CUDA_CHECK(cudaGetLastError());
	/* Wait for the device to finish before reading aconv */
	CUDA_CHECK(cudaDeviceSynchronize());

	min_diagonal<<<NBLK, BLKSIZE>>>(aconv, min_arr);
	CUDA_CHECK(cudaGetLastError());
	CUDA_CHECK(cudaDeviceSynchronize());

	/*
	 * Find global minimum using the local minimums calculated in
	 * min_diagonal().
	 */
	min = min_arr[0];
	for (i = 1; i < N; i++)
		if (min_arr[i] < min)
			min = min_arr[i];

	pretty_print(a, "A");
	pretty_print(aconv, "A_conv");
	printf("Min_diagonal(A_conv): %.2f\n", min);

	CUDA_CHECK(cudaFree(a));
	CUDA_CHECK(cudaFree(aconv));
	CUDA_CHECK(cudaFree(min_arr));

	return (0);
}