ex2b_b.cu (2151B)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N (1 << 3)      /* rows of A */
#define M (1 << 3)      /* columns of A; also the length of x and y */
#define DIM (N * M)
#define BLKSIZE (1 << 10)
#define NBLK ((DIM + BLKSIZE - 1) / BLKSIZE)

/*
 * Abort with a readable message when a CUDA runtime call fails.  Kernel
 * launches return nothing, so wrap cudaGetLastError() right after a launch
 * and cudaDeviceSynchronize() to catch faults raised while the kernel runs.
 */
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "%s:%d: CUDA error: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

/*
 * Transposes the N x M row-major matrix `a` into the M x N row-major matrix
 * `atrans` and, for every column j of `a`, computes
 *
 *     y[j] = sum over i < N of (a[i][j] * a[i][j] * x[j])
 *
 * Calculations taken from lab's example code.
 * NOTE(review): summing the per-row terms is assumed to be the intent (y
 * used to be zeroed and then overwritten by every row); confirm against the
 * lab's reference code.
 *
 * Launch layout: 1-D grid of 1-D blocks of any size.  Columns are handed out
 * with a grid-stride loop and each column is owned by exactly one thread, so
 * y[j] and row j of `atrans` have a single writer: no atomics, barriers,
 * shared memory or pre-zeroed output buffer are required.
 *
 * `a` and `x` are only read; `atrans` and `y` are only written.
 */
__global__ void
transnorm(float *a, float *atrans, float *x, float *y)
{
    int i, j, idx, stridex;
    float v, sum;

    idx = blockIdx.x * blockDim.x + threadIdx.x;
    stridex = blockDim.x * gridDim.x;

    /* Each thread gets a slice of the columns to work with */
    for (j = idx; j < M; j += stridex) {
        /* Accumulate privately so y[j] is stored exactly once */
        sum = 0.0f;
        for (i = 0; i < N; i++) {
            v = a[i * M + j];
            /* Transpose A: atrans has M rows of N elements each */
            atrans[j * N + i] = v;
            sum += v * v * x[j];
        }
        y[j] = sum;
    }
}

/*
 * Prints the n-element vector `arr` on one line as "name = [v0, v1, ...]",
 * two decimals per value.
 */
static void
pretty_print_1d(float *arr, const char *name, int n)
{
    int i;

    printf("\n%s = [", name);
    for (i = 0; i < n; i++) {
        printf("%.2f%s", arr[i],
            (i == n - 1) ? "" : ", ");
    }
    printf("]\n");
}

/*
 * Prints the row-major matrix `arr` one row per line.  `w` is the number of
 * rows and `h` the number of elements per row (the row stride).
 */
static void
pretty_print_2d(float *arr, const char *name, int w, int h)
{
    int i, j;

    printf("\n%s = [\n", name);
    for (i = 0; i < w; i++) {
        printf("\t[");
        for (j = 0; j < h; j++) {
            printf("%.2f%s", arr[i * h + j],
                (j == h - 1) ? "]\n" : ", ");
        }
    }
    printf("]\n");
}

/*
 * Fills A (N x M) and x (M) with random integers in [0, 100), runs the
 * transnorm kernel and prints A, its transpose, x and y.
 */
int
main(int argc, char *argv[])
{
    float *a, *atrans, *x, *y;
    int i, j;

    (void)argc;
    (void)argv;

    srand((unsigned int)time(NULL));

    /*
     * Use unified memory to avoid having additional device arrays and
     * memcpying from host to device and vice versa.
     */
    CUDA_CHECK(cudaMallocManaged(&a, DIM * sizeof(float)));
    CUDA_CHECK(cudaMallocManaged(&atrans, DIM * sizeof(float)));
    CUDA_CHECK(cudaMallocManaged(&x, M * sizeof(float)));
    CUDA_CHECK(cudaMallocManaged(&y, M * sizeof(float)));

    /*
     * Initialize arrays.  x holds M elements, so it gets its own loop
     * instead of being indexed by the row counter of A.
     */
    for (j = 0; j < M; j++)
        x[j] = (float)(rand() % 100);
    for (i = 0; i < N; i++) {
        for (j = 0; j < M; j++)
            a[i * M + j] = (float)(rand() % 100);
    }

    transnorm<<<NBLK, BLKSIZE>>>(a, atrans, x, y);
    /* Catch an invalid launch configuration */
    CUDA_CHECK(cudaGetLastError());
    /*
     * Wait for the kernel to finish before the host reads the managed
     * buffers; this also reports errors raised during execution.
     */
    CUDA_CHECK(cudaDeviceSynchronize());

    pretty_print_2d(a, "A", N, M);
    pretty_print_2d(atrans, "A_trans", M, N);
    pretty_print_1d(x, "X", M);
    pretty_print_1d(y, "Y", M);

    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(atrans));
    CUDA_CHECK(cudaFree(x));
    CUDA_CHECK(cudaFree(y));

    return (0);
}