uni

University stuff
git clone git://git.margiolis.net/uni.git

ex2b_a.cu (3085B)


#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N	(1 << 2)
#define DIM	(N * N)
/*
 * This formula for calculating the number of blocks is taken from the
 * "out of the blocks" section in:
 *
 * https://developer.nvidia.com/blog/even-easier-introduction-cuda/
 */
#define BLKSIZE	(1 << 8)
#define NBLK	((DIM + BLKSIZE - 1) / BLKSIZE)
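/* With N = 4: DIM = 16 and BLKSIZE = 256, so NBLK = (16 + 255) / 256 = 1. */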

/*
 * Apply a 3x3 stencil with the fixed weights c11..c33 to every interior
 * point of the N x N matrix a, storing the result in aconv.
 */
__global__ void
convolution(float *a, float *aconv)
{
	float c11, c12, c13, c21, c22, c23, c31, c32, c33;
	int i, j, x, stridex;

	/*
	 * Each thread gets a slice of the rows to work with, using the
	 * grid-stride idiom described in the "out of the blocks" section of:
	 *
	 * https://developer.nvidia.com/blog/even-easier-introduction-cuda/
	 */
	x = blockIdx.x * blockDim.x + threadIdx.x;
	stridex = blockDim.x * gridDim.x;
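	/*
	 * Note: with this launch configuration, stridex = NBLK * BLKSIZE,
	 * which is at least DIM and therefore larger than N, so each thread
	 * executes at most one iteration of the outer row loop below.
	 */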

	/* Random weight values */
	c11 = +0.2;  c21 = +0.5;  c31 = -0.8;
	c12 = -0.3;  c22 = +0.6;  c32 = -0.9;
	c13 = +0.4;  c23 = +0.7;  c33 = +0.10;

	/* Threads whose starting row is a boundary row or out of range do no work. */
	if (x < 1 || x > N - 1)
		return;
	for (i = x; i < N - 1; i += stridex) {
		for (j = 1; j < N - 1; j++) {
			/* Taken from the lab's example code. */
			aconv[i * N + j] =
			    c11 * a[(i - 1)	* N + (j - 1)] +
			    c12 * a[i		* N + (j - 1)] +
			    c13 * a[(i + 1)	* N + (j - 1)] +
			    c21 * a[(i - 1)	* N + j] +
			    c22 * a[i		* N + j] +
			    c23 * a[(i + 1)	* N + j] +
			    c31 * a[(i - 1)	* N + (j + 1)] +
			    c32 * a[i		* N + (j + 1)] +
			    c33 * a[(i + 1)	* N + (j + 1)];
		}
	}
}

/*
 * Each thread stores the minimum of the main-diagonal elements assigned to
 * it in min_arr[x]; the host later reduces these local minimums to the
 * global minimum.
 */
__global__ void
min_diagonal(float *arr, float *min_arr)
{
	int x, stridex, i;

	x = blockIdx.x * blockDim.x + threadIdx.x;
	stridex = blockDim.x * gridDim.x;

	if (x >= N)
		return;
	/* Calculate local minimums */
	min_arr[x] = arr[x * N + x];
	for (i = x; i < N; i += stridex)
		if (arr[i * N + i] < min_arr[x])
			min_arr[x] = arr[i * N + i];
}

/* Print an N x N matrix stored in row-major order. */
static void
pretty_print(float *arr, const char *name)
{
	int i, j;

	printf("\n%s = [\n", name);
	for (i = 0; i < N; i++) {
		printf("\t[");
		for (j = 0; j < N; j++) {
			printf("%.2f%s", arr[i * N + j],
			    (j == N - 1) ? "]\n" : ", ");
		}
	}
	printf("]\n");
}

int
main(int argc, char *argv[])
{
	float *a, *aconv, *min_arr, min;
	int i;

	srand(time(NULL));

	/*
	 * Use unified memory to avoid having separate device arrays and
	 * explicit memcpys between host and device.
	 *
	 * https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
	 */
	cudaMallocManaged(&a, DIM * sizeof(float));
	cudaMallocManaged(&aconv, DIM * sizeof(float));
	cudaMallocManaged(&min_arr, DIM * sizeof(float));
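	/* Note: only the first N entries of min_arr are actually used. */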

	/* Initialize the input array with random values in [0, 100) */
	for (i = 0; i < DIM; i++)
		a[i] = (float)(rand() % 100);

	convolution<<<NBLK, BLKSIZE>>>(a, aconv);
	/* Wait for the kernel to finish */
	cudaDeviceSynchronize();

	min_diagonal<<<NBLK, BLKSIZE>>>(aconv, min_arr);
	cudaDeviceSynchronize();
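	/* The managed arrays can now be read safely from the host. */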

	/*
	 * Find global minimum using the local minimums calculated in
	 * min_diagonal().
	 */
	min = min_arr[0];
	for (i = 0; i < N; i++)
		if (min_arr[i] < min)
			min = min_arr[i];

	pretty_print(a, "A");
	pretty_print(aconv, "A_conv");
	printf("Min_diagonal(A_conv): %.2f\n", min);

	cudaFree(a);
	cudaFree(aconv);
	cudaFree(min_arr);

	return (0);
}