uni

University stuff
git clone git://git.margiolis.net/uni.git
Log | Files | Refs | README | LICENSE

commit ee20f660a7120358db0a81a4f3e02f60f402c26f
parent 307ec6e9f8bc37f1f18defe3f2e9906c686e958d
Author: Christos Margiolis <christos@margiolis.net>
Date:   Tue, 17 Jan 2023 15:45:46 +0200

foobar

Diffstat:
R c_parallel_systems/ex1/Makefile -> c_cuda_parallel_systems/ex1/Makefile | 0
R c_parallel_systems/ex1/doc.pdf -> c_cuda_parallel_systems/ex1/doc.pdf | 0
R c_parallel_systems/ex1/doc.tex -> c_cuda_parallel_systems/ex1/doc.tex | 0
R c_parallel_systems/ex1/ex1.c -> c_cuda_parallel_systems/ex1/ex1.c | 0
R c_parallel_systems/ex1/randinput -> c_cuda_parallel_systems/ex1/randinput | 0
R c_parallel_systems/ex1/res/run1.png -> c_cuda_parallel_systems/ex1/res/run1.png | 0
R c_parallel_systems/ex1/res/run2.png -> c_cuda_parallel_systems/ex1/res/run2.png | 0
R c_parallel_systems/ex1/res/run3.png -> c_cuda_parallel_systems/ex1/res/run3.png | 0
R c_parallel_systems/ex1/res/run4.png -> c_cuda_parallel_systems/ex1/res/run4.png | 0
R c_parallel_systems/ex1/res/run5.png -> c_cuda_parallel_systems/ex1/res/run5.png | 0
R c_parallel_systems/ex1/res/run6.png -> c_cuda_parallel_systems/ex1/res/run6.png | 0
R c_parallel_systems/ex1/res/run7.png -> c_cuda_parallel_systems/ex1/res/run7.png | 0
A c_cuda_parallel_systems/ex2/ex2a.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A c_cuda_parallel_systems/ex2/ex2b_a.cu | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A c_cuda_parallel_systems/ex2/ex2b_b.cu | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D c_parallel_systems/ex2/Makefile | 5 -----
D c_parallel_systems/ex2/ex2a.c | 130 -------------------------------------------------------------------------------
17 files changed, 349 insertions(+), 135 deletions(-)

diff --git a/c_parallel_systems/ex1/Makefile b/c_cuda_parallel_systems/ex1/Makefile diff --git a/c_parallel_systems/ex1/doc.pdf b/c_cuda_parallel_systems/ex1/doc.pdf Binary files differ. diff --git a/c_parallel_systems/ex1/doc.tex b/c_cuda_parallel_systems/ex1/doc.tex diff --git a/c_parallel_systems/ex1/ex1.c b/c_cuda_parallel_systems/ex1/ex1.c diff --git a/c_parallel_systems/ex1/randinput b/c_cuda_parallel_systems/ex1/randinput diff --git a/c_parallel_systems/ex1/res/run1.png b/c_cuda_parallel_systems/ex1/res/run1.png Binary files differ. diff --git a/c_parallel_systems/ex1/res/run2.png b/c_cuda_parallel_systems/ex1/res/run2.png Binary files differ. diff --git a/c_parallel_systems/ex1/res/run3.png b/c_cuda_parallel_systems/ex1/res/run3.png Binary files differ. diff --git a/c_parallel_systems/ex1/res/run4.png b/c_cuda_parallel_systems/ex1/res/run4.png Binary files differ. diff --git a/c_parallel_systems/ex1/res/run5.png b/c_cuda_parallel_systems/ex1/res/run5.png Binary files differ. diff --git a/c_parallel_systems/ex1/res/run6.png b/c_cuda_parallel_systems/ex1/res/run6.png Binary files differ. diff --git a/c_parallel_systems/ex1/res/run7.png b/c_cuda_parallel_systems/ex1/res/run7.png Binary files differ. diff --git a/c_cuda_parallel_systems/ex2/ex2a.c b/c_cuda_parallel_systems/ex2/ex2a.c @@ -0,0 +1,130 @@ +#include <err.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#include <omp.h> + +static void pretty_print(int *, int, const char *); +static int cmpfunc(const void *, const void *); +static void merge(int *, int *, int *, int *, int *); +static void multisort(int *, int *, int); + +/* + * Print the contents of a 2D array like: + * + * array = [x, y, z] + */ +static void +pretty_print(int *arr, int n, const char *name) +{ + int i; + + printf("\n%s = [", name); + for (i = 0; i < n; i++) + printf("%d%s", arr[i], (i == n - 1) ? "" : ", "); + printf("]\n"); +} + +/* + * Passed to qsort(3). 
+ */ +static int +cmpfunc(const void *a, const void *b) +{ + return (*(int *)a - *(int *)b); +} + +static void +merge(int *a, int *enda, int *b, int *endb, int *res) +{ + while (a <= enda && b <= endb) { + if (*a < *b) + *res++ = *a++; + else + *res++ = *b++; + } + while (a <= enda) + *res++ = *a++; + while (b <= endb) + *res++ = *b++; +} + +static void +multisort(int *arr, int *space, int n) +{ + int quarter, *sta, *spa, *stb, *spb, *stc, *spc, *std, *spd; + + if ((quarter = n / 4) < 4) + qsort(arr, n, sizeof(int), cmpfunc); + else { + /* Split the array into 4 quarters. */ + sta = arr; + spa = space; + stb = sta + quarter; + spb = spa + quarter; + stc = stb + quarter; + spc = spb + quarter; + std = stc + quarter; + spd = spc + quarter; +#pragma omp task + multisort(sta, spa, quarter); +#pragma omp task + multisort(stb, spb, quarter); +#pragma omp task + multisort(stc, spc, quarter); +#pragma omp task + multisort(std, spd, n - 3 * quarter); + /* Wait for the tasks above to finish. */ +#pragma omp taskwait +#pragma omp task + /* Merge A and B into SpaceA */ + merge(sta, sta + quarter - 1, stb, stb + quarter - 1, spa); +#pragma omp task + /* Merge C and D into SpaceC */ + merge(stc, stc + quarter - 1, std, arr + n - 1, spc); +#pragma omp taskwait + /* Merge the two resulting couples (SpaceA and SpaceC). 
*/ + merge(spa, spc - 1, spc, space + n - 1, arr); + } +} + +int +main(int argc, char *argv[]) +{ + int *a, *space, i, n, ntd; + double start, end; + + + if (argc < 3) { + fprintf(stderr, "usage: %s nthreads n\n", *argv); + return (1); + } + if ((ntd = atoi(argv[1])) < 1) + err(1, "can't use nthreads n < 1"); + if ((n = atoi(argv[2])) < 1) + err(1, "can't use n < 1"); + + srand(time(NULL)); + omp_set_num_threads(ntd); + + if ((a = malloc(n * sizeof(int))) == NULL) + err(1, "malloc"); + if ((space = malloc(n * sizeof(int))) == NULL) + err(1, "malloc"); + for (i = 0; i < n; i++) + a[i] = rand() % 100; + + start = omp_get_wtime(); + + pretty_print(a, n, "A_unsorted"); + multisort(a, space, n); + pretty_print(a, n, "A_multisort"); + + end = omp_get_wtime(); + printf("Total time: %f seconds\n", end - start); + + free(a); + + return (0); +} diff --git a/c_cuda_parallel_systems/ex2/ex2b_a.cu b/c_cuda_parallel_systems/ex2/ex2b_a.cu @@ -0,0 +1,116 @@ +#include <stdio.h> +#include <time.h> + +#define N (1 << 2) +#define DIM (N * N) +#define BLKSIZE (1 << 8) +#define NBLK ((N + BLKSIZE - 1) / BLKSIZE) + +__global__ void +convolution(float *a, float *aconv) +{ + float c11, c12, c13, c21, c22, c23, c31, c32, c33; + int i, j, x, stridex; + + /* each thread gets a slice of the rows to work with */ + x = blockIdx.x * blockDim.x + threadIdx.x; + stridex = blockDim.x * gridDim.x; + + c11 = +0.2; c21 = +0.5; c31 = -0.8; + c12 = -0.3; c22 = +0.6; c32 = -0.9; + c13 = +0.4; c23 = +0.7; c33 = +0.10; + + if (x < 1 || x > N - 1) + return; + for (i = x; i < N - 1; i += stridex) { + for (j = 1; j < N - 1; j++) { + aconv[i * N + j] = + c11 * a[(i - 1) * N + (j - 1)] + + c12 * a[i * N + (j - 1)] + + c13 * a[(i + 1) * N + (j - 1)] + + c21 * a[(i - 1) * N + j] + + c22 * a[i * N + j] + + c23 * a[(i + 1) * N + j] + + c31 * a[(i - 1) * N + (j + 1)] + + c32 * a[i * N + (j + 1)] + + c33 * a[(i + 1) * N + (j + 1)]; + } + } +} + +__global__ void +min_diagonal(float *arr, float *min_arr) +{ + int x, 
stridex, i; + + x = blockIdx.x * blockDim.x + threadIdx.x; + stridex = blockDim.x * gridDim.x; + + if (x >= N) + return; + /* calculate local minimums */ + min_arr[x] = arr[x * N + x]; + for (i = x; i < N; i += stridex) + if (arr[i * N + i] < min_arr[x]) + min_arr[x] = arr[i * N + i]; +} + +static void +pretty_print(float *arr, const char *name) +{ + int i, j; + + printf("\n%s = [\n", name); + for (i = 0; i < N; i++) { + printf("\t["); + for (j = 0; j < N; j++) { + printf("%.2f%s", arr[i * N + j], + (j == N - 1) ? "]\n" : ", "); + } + } + printf("]\n"); +} + +int +main(int argc, char *argv[]) +{ + float *a, *aconv, *min_arr, min; + int i; + + srand(time(NULL)); + + /* + * use unified memory to avoid having additional device arrays and + * memcpying from host to device and vice versa + */ + cudaMallocManaged(&a, DIM * sizeof(float)); + cudaMallocManaged(&aconv, DIM * sizeof(float)); + cudaMallocManaged(&min_arr, DIM * sizeof(float)); + + /* initialize array */ + for (i = 0; i < DIM; i++) + a[i] = (float)(rand() % 100); + + convolution<<<NBLK, BLKSIZE>>>(a, aconv); + /* wait for all devices to finish */ + cudaDeviceSynchronize(); + + min_diagonal<<<NBLK, BLKSIZE>>>(aconv, min_arr); + cudaDeviceSynchronize(); + + /* find global minimum */ + min = min_arr[0]; + for (i = 0; i < N; i++) + if (min_arr[i] < min) + min = min_arr[i]; + + pretty_print(a, "A"); + pretty_print(aconv, "A_conv"); + printf("Min_diagonal(A_conv): %.2f\n", min); + + cudaFree(a); + cudaFree(aconv); + cudaFree(min_arr); + + return (0); +} diff --git a/c_cuda_parallel_systems/ex2/ex2b_b.cu b/c_cuda_parallel_systems/ex2/ex2b_b.cu @@ -0,0 +1,103 @@ +#include <stdio.h> +#include <time.h> + +#define N (1 << 2) +#define M (1 << 1) +#define DIM (N * M) +#define BLKSIZE (1 << 8) +#define NBLK ((DIM + BLKSIZE - 1) / BLKSIZE) + +__global__ void +transnorm(float *a, float *atrans, float *x, float *y) +{ + int i, j, idx, stridex; + + /* each thread gets a slice of the rows to work with */ + idx = blockIdx.x * 
blockDim.x + threadIdx.x; + stridex = blockDim.x * gridDim.x; + + if (idx >= N) + return; + /* first thread initializes y */ + if (threadIdx.x == 0) { + for (i = 0; i < M; i++) + y[i] = 0; + } + for (i = idx; i < N; i += stridex) { + for (j = 0; j < M; j++) { + /* transpose a */ + atrans[j * N + i] = a[i * M + j]; + y[j] = atrans[j * M + i] * a[i * M + j] * x[j]; + } + } +} + +static void +pretty_print_1d(float *arr, const char *name, int n) +{ + int i; + + printf("\n%s = [", name); + for (i = 0; i < n; i++) { + printf("%.2f%s", arr[i], + (i == n - 1) ? "" : ", "); + } + printf("]\n"); +} + +static void +pretty_print_2d(float *arr, const char *name, int w, int h) +{ + int i, j; + + printf("\n%s = [\n", name); + for (i = 0; i < w; i++) { + printf("\t["); + for (j = 0; j < h; j++) { + printf("%.2f%s", arr[i * h + j], + (j == h - 1) ? "]\n" : ", "); + } + } + printf("]\n"); +} + +int +main(int argc, char *argv[]) +{ + float *a, *atrans, *x, *y; + int i, j; + + srand(time(NULL)); + + /* + * use unified memory to avoid having additional device arrays and + * memcpying from host to device and vice versa + */ + cudaMallocManaged(&a, DIM * sizeof(float)); + cudaMallocManaged(&atrans, DIM * sizeof(float)); + cudaMallocManaged(&x, M * sizeof(float)); + cudaMallocManaged(&y, M * sizeof(float)); + + /* initialize array */ + for (i = 0; i < N; i++) { + x[i] = (float)(rand() % 100); + for (j = 0; j < M; j++) + a[i * M + j] = (float)(rand() % 100); + } + + transnorm<<<NBLK, BLKSIZE>>>(a, atrans, x, y); + /* wait for all devices to finish */ + cudaDeviceSynchronize(); + + pretty_print_2d(a, "A", N, M); + pretty_print_2d(atrans, "A_trans", M, N); + pretty_print_1d(x, "X", M); + pretty_print_1d(y, "Y", M); + + cudaFree(a); + cudaFree(atrans); + cudaFree(x); + cudaFree(y); + + return (0); +} diff --git a/c_parallel_systems/ex2/Makefile b/c_parallel_systems/ex2/Makefile @@ -1,5 +0,0 @@ -all: - cc ex2a.c -fopenmp -lomp -o ex2a - -clean: - rm -f ex2a *.core diff --git 
a/c_parallel_systems/ex2/ex2a.c b/c_parallel_systems/ex2/ex2a.c @@ -1,130 +0,0 @@ -#include <err.h> -#include <stdio.h> -#include <stdlib.h> -#include <time.h> - -#include <omp.h> - -static void pretty_print(int *, int, const char *); -static int cmpfunc(const void *, const void *); -static void merge(int *, int *, int *, int *, int *); -static void multisort(int *, int *, int); - -/* - * Print the contents of a 2D array like: - * - * array = [x, y, z] - */ -static void -pretty_print(int *arr, int n, const char *name) -{ - int i; - - printf("\n%s = [", name); - for (i = 0; i < n; i++) - printf("%d%s", arr[i], (i == n - 1) ? "" : ", "); - printf("]\n"); -} - -/* - * Passed to qsort(3). - */ -static int -cmpfunc(const void *a, const void *b) -{ - return (*(int *)a - *(int *)b); -} - -static void -merge(int *a, int *enda, int *b, int *endb, int *res) -{ - while (a <= enda && b <= endb) { - if (*a < *b) - *res++ = *a++; - else - *res++ = *b++; - } - while (a <= enda) - *res++ = *a++; - while (b <= endb) - *res++ = *b++; -} - -static void -multisort(int *arr, int *space, int n) -{ - int quarter, *sta, *spa, *stb, *spb, *stc, *spc, *std, *spd; - - if ((quarter = n / 4) < 4) - qsort(arr, n, sizeof(int), cmpfunc); - else { - /* Split the array into 4 quarters. */ - sta = arr; - spa = space; - stb = sta + quarter; - spb = spa + quarter; - stc = stb + quarter; - spc = spb + quarter; - std = stc + quarter; - spd = spc + quarter; -#pragma omp task - multisort(sta, spa, quarter); -#pragma omp task - multisort(stb, spb, quarter); -#pragma omp task - multisort(stc, spc, quarter); -#pragma omp task - multisort(std, spd, n - 3 * quarter); - /* Wait for the tasks above to finish. 
*/ -#pragma omp taskwait -#pragma omp task - /* Merge A and B into SpaceA */ - merge(sta, sta + quarter - 1, stb, stb + quarter - 1, spa); -#pragma omp task - /* Merge C and D into SpaceC */ - merge(stc, stc + quarter - 1, std, arr + n - 1, spc); -#pragma omp taskwait - /* Merge the two resulting couples (SpaceA and SpaceC). */ - merge(spa, spc - 1, spc, space + n - 1, arr); - } -} - -int -main(int argc, char *argv[]) -{ - int *a, *space, i, n, ntd; - double start, end; - - - if (argc < 3) { - fprintf(stderr, "usage: %s nthreads n\n", getprogname()); - return (1); - } - if ((ntd = atoi(argv[1])) < 1) - err(1, "can't use nthreads n < 1"); - if ((n = atoi(argv[2])) < 1) - err(1, "can't use n < 1"); - - srand(time(NULL)); - omp_set_num_threads(ntd); - - if ((a = malloc(n * sizeof(int))) == NULL) - err(1, "malloc"); - if ((space = malloc(n * sizeof(int))) == NULL) - err(1, "malloc"); - for (i = 0; i < n; i++) - a[i] = rand() % 100; - - start = omp_get_wtime(); - - pretty_print(a, n, "A_unsorted"); - multisort(a, space, n); - pretty_print(a, n, "A_multisort"); - - end = omp_get_wtime(); - printf("Total time: %f seconds\n", end - start); - - free(a); - - return (0); -}