uni

University stuff
git clone git://git.margiolis.net/uni.git
Log | Files | Refs | README | LICENSE

commit ffcdbe764a8b51912ea821f6c30fb6960a43dbf2
parent ee20f660a7120358db0a81a4f3e02f60f402c26f
Author: Christos Margiolis <christos@margiolis.net>
Date:   Tue, 31 Jan 2023 14:17:00 +0200

done

Diffstat:
A c_cuda_parallel_systems/ex2/doc.pdf | 0
A c_cuda_parallel_systems/ex2/doc.tex | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A c_cuda_parallel_systems/ex2/doc.toc | 24 ++++++++++++++++++++++++
M c_cuda_parallel_systems/ex2/ex2a.c | 11 ++++++++++-
M c_cuda_parallel_systems/ex2/ex2b_a.cu | 36 +++++++++++++++++++++++++++---------
M c_cuda_parallel_systems/ex2/ex2b_b.cu | 25 ++++++++++++++-----------
A c_cuda_parallel_systems/ex2/res/ex2a_1.png | 0
A c_cuda_parallel_systems/ex2/res/ex2a_2.png | 0
A c_cuda_parallel_systems/ex2/res/ex2a_3.png | 0
A c_cuda_parallel_systems/ex2/res/ex2b_a_1.png | 0
A c_cuda_parallel_systems/ex2/res/ex2b_a_2.png | 0
A c_cuda_parallel_systems/ex2/res/ex2b_a_3.png | 0
A c_cuda_parallel_systems/ex2/res/ex2b_b_1.png | 0
A c_cuda_parallel_systems/ex2/res/ex2b_b_2.png | 0
A c_cuda_parallel_systems/ex2/res/ex2b_b_3.png | 0
15 files changed, 182 insertions(+), 21 deletions(-)

diff --git a/c_cuda_parallel_systems/ex2/doc.pdf b/c_cuda_parallel_systems/ex2/doc.pdf Binary files differ. diff --git a/c_cuda_parallel_systems/ex2/doc.tex b/c_cuda_parallel_systems/ex2/doc.tex @@ -0,0 +1,107 @@ +\documentclass{article} +\usepackage[utf8]{inputenc} +\usepackage[greek,english]{babel} +\usepackage{alphabeta} +\usepackage{fancyhdr} +\usepackage{listings} +\usepackage{mathtools} +\usepackage{xcolor} +\usepackage{biblatex} +\usepackage[left=1cm,right=1cm]{geometry} + +\lstset { + basicstyle=\ttfamily, + columns=fullflexible, + breaklines=true, + keepspaces=true, + showstringspaces=false +} + +\title{Εργαστήριο Παράλληλων Συστημάτων - Εργασία 2} +\author{Χρήστος Μαργιώλης} +\date{Ιανουάριος 2023} + +\begin{document} + +\begin{titlepage} + \maketitle +\end{titlepage} + +\renewcommand{\contentsname}{Περιεχόμενα} +\tableofcontents +\pagebreak + +\section{Προγράμματα} + +Οι κώδικες έχουν σχόλια μόνο στα σημεία που θεώρησα ότι μπορεί να προκύψει +κάποιο «μπέρδεμα». + +\subsection{'Ασκηση 2Α} + +\subsubsection{Κώδικας} +\lstinputlisting[language=C]{ex2a.c} +\pagebreak +\subsubsection{Ενδεικτικά τρεξίματα} + +\begin{lstlisting} +usage: ./a.out nthreads n +\end{lstlisting} + +Για \lstinline{nthreads = 2} και \lstinline{n = 10}: \\ + +\includegraphics[width=\textwidth]{res/ex2a_1.png} \\ + +Για \lstinline{nthreads = 8} και \lstinline{n = 100}: \\ + +\includegraphics[width=\textwidth]{res/ex2a_2.png} \\ + +Για \lstinline{nthreads = 16} και \lstinline{n = 1000000}. 
Λόγω του αριθμού των +στοιχείων, το στιγμιότυπο δείχνει μόνο τον χρόνο υπολογισμού: \\ + +\includegraphics[width=\textwidth]{res/ex2a_3.png} \\ +\pagebreak + +\subsection{'Ασκηση 2Β-Α} + +\subsubsection{Κώδικας} +\lstinputlisting[language=C]{ex2b_a.cu} +\pagebreak +\subsubsection{Ενδεικτικά τρεξίματα} + +Για \lstinline{NxN = 8x8} και \lstinline{blocksize = 256} \\ + +\includegraphics[width=\textwidth]{res/ex2b_a_1.png} \\ + +Για \lstinline{NxN = 32x32} και \lstinline{blocksize = 1024} \\ + +\includegraphics[width=\textwidth]{res/ex2b_a_2.png} \\ + +Για \lstinline{NxN = 4x4} και \lstinline{blocksize = 256} \\ + +\includegraphics[width=\textwidth]{res/ex2b_a_3.png} \\ +\pagebreak + +\subsection{'Ασκηση 2Β-Β} + +\subsubsection{Κώδικας} +\lstinputlisting[language=C]{ex2b_b.cu} +\pagebreak +\subsubsection{Ενδεικτικά τρεξίματα} + +Για \lstinline{NxM = 4x2} και \lstinline{blocksize = 256} \\ + +\includegraphics[width=\textwidth]{res/ex2b_b_1.png} \\ + +Για \lstinline{NxM = 4x8} και \lstinline{blocksize = 256} \\ + +\includegraphics[width=\textwidth]{res/ex2b_b_2.png} \\ + +Για \lstinline{NxM = 8x8} και \lstinline{blocksize = 1024} \\ + +\includegraphics[width=\textwidth]{res/ex2b_b_3.png} \\ + +\section{Προβλήματα} + +Δεν υλοποίησα την άσκηση 2B-Γ (συνδιακύμανση). 
+ +\end{document} diff --git a/c_cuda_parallel_systems/ex2/doc.toc b/c_cuda_parallel_systems/ex2/doc.toc @@ -0,0 +1,24 @@ +\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax +\babel@toc {english}{} +\defcounter {refsection}{0}\relax +\contentsline {section}{\numberline {1}Προγράμματα}{2}{}% +\defcounter {refsection}{0}\relax +\contentsline {subsection}{\numberline {1.1}'Ασκηση 2Α}{2}{}% +\defcounter {refsection}{0}\relax +\contentsline {subsubsection}{\numberline {1.1.1}Κώδικας}{2}{}% +\defcounter {refsection}{0}\relax +\contentsline {subsubsection}{\numberline {1.1.2}Ενδεικτικά τρεξίματα}{6}{}% +\defcounter {refsection}{0}\relax +\contentsline {subsection}{\numberline {1.2}'Ασκηση 2Β-Α}{7}{}% +\defcounter {refsection}{0}\relax +\contentsline {subsubsection}{\numberline {1.2.1}Κώδικας}{7}{}% +\defcounter {refsection}{0}\relax +\contentsline {subsubsection}{\numberline {1.2.2}Ενδεικτικά τρεξίματα}{10}{}% +\defcounter {refsection}{0}\relax +\contentsline {subsection}{\numberline {1.3}'Ασκηση 2Β-Β}{13}{}% +\defcounter {refsection}{0}\relax +\contentsline {subsubsection}{\numberline {1.3.1}Κώδικας}{13}{}% +\defcounter {refsection}{0}\relax +\contentsline {subsubsection}{\numberline {1.3.2}Ενδεικτικά τρεξίματα}{16}{}% +\defcounter {refsection}{0}\relax +\contentsline {section}{\numberline {2}Προβλήματα}{18}{}% diff --git a/c_cuda_parallel_systems/ex2/ex2a.c b/c_cuda_parallel_systems/ex2/ex2a.c @@ -11,7 +11,7 @@ static void merge(int *, int *, int *, int *, int *); static void multisort(int *, int *, int); /* - * Print the contents of a 2D array like: + * Print the contents of an array like: * * array = [x, y, z] */ @@ -35,6 +35,9 @@ cmpfunc(const void *a, const void *b) return (*(int *)a - *(int *)b); } +/* + * Merge sort + */ static void merge(int *a, int *enda, int *b, int *endb, int *res) { @@ -55,6 +58,10 @@ multisort(int *arr, int *space, int n) { int quarter, *sta, *spa, *stb, *spb, *stc, *spc, *std, *spd; + /* + 
* Sort with qsort(3) directly if we can't split the array into 4 + * quarters. + */ if ((quarter = n / 4) < 4) qsort(arr, n, sizeof(int), cmpfunc); else { @@ -67,6 +74,7 @@ multisort(int *arr, int *space, int n) spc = spb + quarter; std = stc + quarter; spd = spc + quarter; + /* Sort each quarter */ #pragma omp task multisort(sta, spa, quarter); #pragma omp task @@ -115,6 +123,7 @@ main(int argc, char *argv[]) for (i = 0; i < n; i++) a[i] = rand() % 100; + /* Calculate speed up */ start = omp_get_wtime(); pretty_print(a, n, "A_unsorted"); diff --git a/c_cuda_parallel_systems/ex2/ex2b_a.cu b/c_cuda_parallel_systems/ex2/ex2b_a.cu @@ -3,8 +3,14 @@ #define N (1 << 2) #define DIM (N * N) +/* + * This formula for calculating the number of blocks is mentioned at "out of + * the blocks" section in: + * + * https://developer.nvidia.com/blog/even-easier-introduction-cuda/ + */ #define BLKSIZE (1 << 8) -#define NBLK ((N + BLKSIZE - 1) / BLKSIZE) +#define NBLK ((DIM + BLKSIZE - 1) / BLKSIZE) __global__ void convolution(float *a, float *aconv) @@ -12,10 +18,16 @@ convolution(float *a, float *aconv) float c11, c12, c13, c21, c22, c23, c31, c32, c33; int i, j, x, stridex; - /* each thread gets a slice of the rows to work with */ + /* + * Each thread gets a slice of the rows to work with. Grid-stride idiom + * mentioned at section "out of the blocks" in: + * + * https://developer.nvidia.com/blog/even-easier-introduction-cuda/ + */ x = blockIdx.x * blockDim.x + threadIdx.x; stridex = blockDim.x * gridDim.x; + /* Random weight values */ c11 = +0.2; c21 = +0.5; c31 = -0.8; c12 = -0.3; c22 = +0.6; c32 = -0.9; c13 = +0.4; c23 = +0.7; c33 = +0.10; @@ -24,6 +36,7 @@ convolution(float *a, float *aconv) return; for (i = x; i < N - 1; i += stridex) { for (j = 1; j < N - 1; j++) { + /* Taken from the lab's example code. 
*/ aconv[i * N + j] = c11 * a[(i - 1) * N + (j - 1)] + c12 * a[i * N + (j - 1)] + @@ -48,7 +61,7 @@ min_diagonal(float *arr, float *min_arr) if (x >= N) return; - /* calculate local minimums */ + /* Calculate local minimums */ min_arr[x] = arr[x * N + x]; for (i = x; i < N; i += stridex) if (arr[i * N + i] < min_arr[x]) @@ -79,26 +92,31 @@ main(int argc, char *argv[]) srand(time(NULL)); - /* - * use unified memory to avoid having additional device arrays and - * memcpying from host to device and vice versa + /* + * Use unified memory to avoid having additional device arrays and + * memcpying from host to device and vice versa. + * + * https://developer.nvidia.com/blog/unified-memory-cuda-beginners/ */ cudaMallocManaged(&a, DIM * sizeof(float)); cudaMallocManaged(&aconv, DIM * sizeof(float)); cudaMallocManaged(&min_arr, DIM * sizeof(float)); - /* initialize array */ + /* Initialize array */ for (i = 0; i < DIM; i++) a[i] = (float)(rand() % 100); convolution<<<NBLK, BLKSIZE>>>(a, aconv); - /* wait for all devices to finish */ + /* Wait for all devices to finish */ cudaDeviceSynchronize(); min_diagonal<<<NBLK, BLKSIZE>>>(aconv, min_arr); cudaDeviceSynchronize(); - /* find global minimum */ + /* + * Find global minimum using the local minimums calculated in + * min_diagonal(). + */ min = min_arr[0]; for (i = 0; i < N; i++) if (min_arr[i] < min) diff --git a/c_cuda_parallel_systems/ex2/ex2b_b.cu b/c_cuda_parallel_systems/ex2/ex2b_b.cu @@ -1,31 +1,34 @@ #include <stdio.h> #include <time.h> -#define N (1 << 2) -#define M (1 << 1) +#define N (1 << 3) +#define M (1 << 3) #define DIM (N * M) -#define BLKSIZE (1 << 8) +#define BLKSIZE (1 << 10) #define NBLK ((DIM + BLKSIZE - 1) / BLKSIZE) +/* + * Calculations taken from lab's example code. 
+ */ __global__ void transnorm(float *a, float *atrans, float *x, float *y) { int i, j, idx, stridex; - /* each thread gets a slice of the rows to work with */ + /* Each thread gets a slice of the rows to work with */ idx = blockIdx.x * blockDim.x + threadIdx.x; stridex = blockDim.x * gridDim.x; if (idx >= N) return; - /* first thread initializes y */ + /* First thread initializes y */ if (threadIdx.x == 0) { for (i = 0; i < M; i++) y[i] = 0; } for (i = idx; i < N; i += stridex) { for (j = 0; j < M; j++) { - /* transpose a */ + /* Transpose A */ atrans[j * N + i] = a[i * M + j]; y[j] = atrans[j * M + i] * a[i * M + j] * x[j]; } @@ -69,16 +72,16 @@ main(int argc, char *argv[]) srand(time(NULL)); - /* - * use unified memory to avoid having additional device arrays and - * memcpying from host to device and vice versa + /* + * Use unified memory to avoid having additional device arrays and + * memcpying from host to device and vice versa. */ cudaMallocManaged(&a, DIM * sizeof(float)); cudaMallocManaged(&atrans, DIM * sizeof(float)); cudaMallocManaged(&x, M * sizeof(float)); cudaMallocManaged(&y, M * sizeof(float)); - /* initialize array */ + /* Initialize arrays */ for (i = 0; i < N; i++) { x[i] = (float)(rand() % 100); for (j = 0; j < M; j++) @@ -86,7 +89,7 @@ main(int argc, char *argv[]) } transnorm<<<NBLK, BLKSIZE>>>(a, atrans, x, y); - /* wait for all devices to finish */ + /* Wait for all devices to finish */ cudaDeviceSynchronize(); pretty_print_2d(a, "A", N, M); diff --git a/c_cuda_parallel_systems/ex2/res/ex2a_1.png b/c_cuda_parallel_systems/ex2/res/ex2a_1.png Binary files differ. diff --git a/c_cuda_parallel_systems/ex2/res/ex2a_2.png b/c_cuda_parallel_systems/ex2/res/ex2a_2.png Binary files differ. diff --git a/c_cuda_parallel_systems/ex2/res/ex2a_3.png b/c_cuda_parallel_systems/ex2/res/ex2a_3.png Binary files differ. diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_a_1.png b/c_cuda_parallel_systems/ex2/res/ex2b_a_1.png Binary files differ. 
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_a_2.png b/c_cuda_parallel_systems/ex2/res/ex2b_a_2.png Binary files differ. diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_a_3.png b/c_cuda_parallel_systems/ex2/res/ex2b_a_3.png Binary files differ. diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_b_1.png b/c_cuda_parallel_systems/ex2/res/ex2b_b_1.png Binary files differ. diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_b_2.png b/c_cuda_parallel_systems/ex2/res/ex2b_b_2.png Binary files differ. diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_b_3.png b/c_cuda_parallel_systems/ex2/res/ex2b_b_3.png Binary files differ.