ex2b_b.cu - uni - University stuff

ex2b_b.cu (2151B)
      1 #include <stdio.h>
      2 #include <time.h>
      3 
      4 #define N	(1 << 3)
      5 #define M	(1 << 3)
      6 #define DIM	(N * M)
      7 #define BLKSIZE	(1 << 10)
      8 #define NBLK	((DIM + BLKSIZE - 1) / BLKSIZE)
      9 
     10 /*
     11  * Calculations taken from lab's example code.
     12  */
     13 __global__ void
     14 transnorm(float *a, float *atrans, float *x, float *y)
     15 {
     16 	int i, j, idx, stridex;
     17 
     18 	/* Each thread gets a slice of the rows to work with */
     19 	idx = blockIdx.x * blockDim.x + threadIdx.x;
     20 	stridex = blockDim.x * gridDim.x;
     21 
     22 	if (idx >= N)
     23 		return;
     24 	/* First thread initializes y */
     25 	if (threadIdx.x == 0) {
     26 		for (i = 0; i < M; i++)
     27 			y[i] = 0;
     28 	}
     29 	for (i = idx; i < N; i += stridex) {
     30 		for (j = 0; j < M; j++) {
     31 			/* Transpose A */
     32 			atrans[j * N + i] = a[i * M + j];
     33 			y[j] = atrans[j * M + i] * a[i * M + j] * x[j];
     34 		}
     35 	}
     36 }
     37 
     38 static void
     39 pretty_print_1d(float *arr, const char *name, int n)
     40 {
     41 	int i;
     42 
     43 	printf("\n%s = [", name);
     44 	for (i = 0; i < n; i++) {
     45 		printf("%.2f%s", arr[i],
     46 		   (i == n - 1) ? "" : ", ");
     47 	}
     48 	printf("]\n");
     49 }
     50 
     51 static void
     52 pretty_print_2d(float *arr, const char *name, int w, int h)
     53 {
     54 	int i, j;
     55 
     56 	printf("\n%s = [\n", name);
     57 	for (i = 0; i < w; i++) {
     58 		printf("\t[");
     59 		for (j = 0; j < h; j++) {
     60 			printf("%.2f%s", arr[i * h + j],
     61 			   (j == h - 1) ? "]\n" : ", ");
     62 		}
     63 	}
     64 	printf("]\n");
     65 }
     66 
     67 int
     68 main(int argc, char *argv[])
     69 {
     70 	float *a, *atrans, *x, *y;
     71 	int i, j;
     72 
     73 	srand(time(NULL));
     74 
     75 	/*
     76 	 * Use unified memory to avoid having additional device arrays and
     77 	 * memcpying from host to device and vice versa.
     78 	 */
     79 	cudaMallocManaged(&a, DIM * sizeof(float));
     80 	cudaMallocManaged(&atrans, DIM * sizeof(float));
     81 	cudaMallocManaged(&x, M * sizeof(float));
     82 	cudaMallocManaged(&y, M * sizeof(float));
     83 
     84 	/* Initialize arrays */
     85 	for (i = 0; i < N; i++) {
     86 		x[i] = (float)(rand() % 100);
     87 		for (j = 0; j < M; j++)
     88 			a[i * M + j] = (float)(rand() % 100);
     89 	}
     90 
     91 	transnorm<<<NBLK, BLKSIZE>>>(a, atrans, x, y);
     92 	/* Wait for all devices to finish */
     93 	cudaDeviceSynchronize();
     94 
     95 	pretty_print_2d(a, "A", N, M);
     96 	pretty_print_2d(atrans, "A_trans", M, N);
     97 	pretty_print_1d(x, "X", M);
     98 	pretty_print_1d(y, "Y", M);
     99 
    100 	cudaFree(a);
    101 	cudaFree(atrans);
    102 	cudaFree(x);
    103 	cudaFree(y);
    104 
    105 	return (0);
    106 }
	uni University stuff
	git clone git://git.margiolis.net/uni.git
	Log \| Files \| Refs \| README \| LICENSE