commit ffcdbe764a8b51912ea821f6c30fb6960a43dbf2
parent ee20f660a7120358db0a81a4f3e02f60f402c26f
Author: Christos Margiolis <christos@margiolis.net>
Date: Tue, 31 Jan 2023 14:17:00 +0200
done
Diffstat:
15 files changed, 182 insertions(+), 21 deletions(-)
diff --git a/c_cuda_parallel_systems/ex2/doc.pdf b/c_cuda_parallel_systems/ex2/doc.pdf
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/doc.tex b/c_cuda_parallel_systems/ex2/doc.tex
@@ -0,0 +1,107 @@
+\documentclass{article}
+\usepackage[utf8]{inputenc}
+\usepackage[greek,english]{babel}
+\usepackage{alphabeta}
+\usepackage{fancyhdr}
+\usepackage{listings}
+\usepackage{mathtools}
+\usepackage{xcolor}
+\usepackage{biblatex}
+\usepackage[left=1cm,right=1cm]{geometry}
+
+\lstset {
+ basicstyle=\ttfamily,
+ columns=fullflexible,
+ breaklines=true,
+ keepspaces=true,
+ showstringspaces=false
+}
+
+\title{Εργαστήριο Παράλληλων Συστημάτων - Εργασία 2}
+\author{Χρήστος Μαργιώλης}
+\date{Ιανουάριος 2023}
+
+\begin{document}
+
+\begin{titlepage}
+ \maketitle
+\end{titlepage}
+
+\renewcommand{\contentsname}{Περιεχόμενα}
+\tableofcontents
+\pagebreak
+
+\section{Προγράμματα}
+
+Οι κώδικες έχουν σχόλια μόνο στα σημεία που θεώρησα ότι μπορεί να προκύψει
+κάποιο «μπέρδεμα».
+
+\subsection{'Ασκηση 2Α}
+
+\subsubsection{Κώδικας}
+\lstinputlisting[language=C]{ex2a.c}
+\pagebreak
+\subsubsection{Ενδεικτικά τρεξίματα}
+
+\begin{lstlisting}
+usage: ./a.out nthreads n
+\end{lstlisting}
+
+Για \lstinline{nthreads = 2} και \lstinline{n = 10}: \\
+
+\includegraphics[width=\textwidth]{res/ex2a_1.png} \\
+
+Για \lstinline{nthreads = 8} και \lstinline{n = 100}: \\
+
+\includegraphics[width=\textwidth]{res/ex2a_2.png} \\
+
+Για \lstinline{nthreads = 16} και \lstinline{n = 1000000}. Λόγω του αριθμού των
+στοιχείων, το στιγμιότυπο δείχνει μόνο τον χρόνο υπολογισμού: \\
+
+\includegraphics[width=\textwidth]{res/ex2a_3.png} \\
+\pagebreak
+
+\subsection{'Ασκηση 2Β-Α}
+
+\subsubsection{Κώδικας}
+\lstinputlisting[language=C]{ex2b_a.cu}
+\pagebreak
+\subsubsection{Ενδεικτικά τρεξίματα}
+
+Για \lstinline{NxN = 8x8} και \lstinline{blocksize = 256} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_a_1.png} \\
+
+Για \lstinline{NxN = 32x32} και \lstinline{blocksize = 1024} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_a_2.png} \\
+
+Για \lstinline{NxN = 4x4} και \lstinline{blocksize = 256} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_a_3.png} \\
+\pagebreak
+
+\subsection{'Ασκηση 2Β-Β}
+
+\subsubsection{Κώδικας}
+\lstinputlisting[language=C]{ex2b_b.cu}
+\pagebreak
+\subsubsection{Ενδεικτικά τρεξίματα}
+
+Για \lstinline{NxM = 4x2} και \lstinline{blocksize = 256} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_b_1.png} \\
+
+Για \lstinline{NxM = 4x8} και \lstinline{blocksize = 256} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_b_2.png} \\
+
+Για \lstinline{NxM = 8x8} και \lstinline{blocksize = 1024} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_b_3.png} \\
+
+\section{Προβλήματα}
+
+Δεν υλοποίησα την άσκηση 2Β-Γ (συνδιακύμανση).
+
+\end{document}
diff --git a/c_cuda_parallel_systems/ex2/doc.toc b/c_cuda_parallel_systems/ex2/doc.toc
@@ -0,0 +1,24 @@
+\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
+\babel@toc {english}{}
+\defcounter {refsection}{0}\relax
+\contentsline {section}{\numberline {1}Προγράμματα}{2}{}%
+\defcounter {refsection}{0}\relax
+\contentsline {subsection}{\numberline {1.1}'Ασκηση 2Α}{2}{}%
+\defcounter {refsection}{0}\relax
+\contentsline {subsubsection}{\numberline {1.1.1}Κώδικας}{2}{}%
+\defcounter {refsection}{0}\relax
+\contentsline {subsubsection}{\numberline {1.1.2}Ενδεικτικά τρεξίματα}{6}{}%
+\defcounter {refsection}{0}\relax
+\contentsline {subsection}{\numberline {1.2}'Ασκηση 2Β-Α}{7}{}%
+\defcounter {refsection}{0}\relax
+\contentsline {subsubsection}{\numberline {1.2.1}Κώδικας}{7}{}%
+\defcounter {refsection}{0}\relax
+\contentsline {subsubsection}{\numberline {1.2.2}Ενδεικτικά τρεξίματα}{10}{}%
+\defcounter {refsection}{0}\relax
+\contentsline {subsection}{\numberline {1.3}'Ασκηση 2Β-Β}{13}{}%
+\defcounter {refsection}{0}\relax
+\contentsline {subsubsection}{\numberline {1.3.1}Κώδικας}{13}{}%
+\defcounter {refsection}{0}\relax
+\contentsline {subsubsection}{\numberline {1.3.2}Ενδεικτικά τρεξίματα}{16}{}%
+\defcounter {refsection}{0}\relax
+\contentsline {section}{\numberline {2}Προβλήματα}{18}{}%
diff --git a/c_cuda_parallel_systems/ex2/ex2a.c b/c_cuda_parallel_systems/ex2/ex2a.c
@@ -11,7 +11,7 @@ static void merge(int *, int *, int *, int *, int *);
static void multisort(int *, int *, int);
/*
- * Print the contents of a 2D array like:
+ * Print the contents of an array like:
*
* array = [x, y, z]
*/
@@ -35,6 +35,9 @@ cmpfunc(const void *a, const void *b)
return (*(int *)a - *(int *)b);
}
+/*
+ * Merge step of the sort (presumably merges a..enda and b..endb into res).
+ */
static void
merge(int *a, int *enda, int *b, int *endb, int *res)
{
@@ -55,6 +58,10 @@ multisort(int *arr, int *space, int n)
{
int quarter, *sta, *spa, *stb, *spb, *stc, *spc, *std, *spd;
+ /*
+ * Sort with qsort(3) directly if each quarter would hold fewer than 4
+ * elements (i.e. n < 16).
+ */
if ((quarter = n / 4) < 4)
qsort(arr, n, sizeof(int), cmpfunc);
else {
@@ -67,6 +74,7 @@ multisort(int *arr, int *space, int n)
spc = spb + quarter;
std = stc + quarter;
spd = spc + quarter;
+ /* Sort each quarter */
#pragma omp task
multisort(sta, spa, quarter);
#pragma omp task
@@ -115,6 +123,7 @@ main(int argc, char *argv[])
for (i = 0; i < n; i++)
a[i] = rand() % 100;
+ /* Start measuring wall-clock time */
start = omp_get_wtime();
pretty_print(a, n, "A_unsorted");
diff --git a/c_cuda_parallel_systems/ex2/ex2b_a.cu b/c_cuda_parallel_systems/ex2/ex2b_a.cu
@@ -3,8 +3,14 @@
#define N (1 << 2)
#define DIM (N * N)
+/*
+ * The ceiling-division formula used for NBLK below is described in the
+ * "Out of the Blocks" section of:
+ *
+ * https://developer.nvidia.com/blog/even-easier-introduction-cuda/
+ */
#define BLKSIZE (1 << 8)
-#define NBLK ((N + BLKSIZE - 1) / BLKSIZE)
+#define NBLK ((DIM + BLKSIZE - 1) / BLKSIZE)
__global__ void
convolution(float *a, float *aconv)
@@ -12,10 +18,16 @@ convolution(float *a, float *aconv)
float c11, c12, c13, c21, c22, c23, c31, c32, c33;
int i, j, x, stridex;
- /* each thread gets a slice of the rows to work with */
+ /*
+ * Each thread gets a slice of the rows to work with. This grid-stride
+ * idiom is described in the "Out of the Blocks" section of:
+ *
+ * https://developer.nvidia.com/blog/even-easier-introduction-cuda/
+ */
x = blockIdx.x * blockDim.x + threadIdx.x;
stridex = blockDim.x * gridDim.x;
+ /* Arbitrary, hard-coded weight values */
c11 = +0.2; c21 = +0.5; c31 = -0.8;
c12 = -0.3; c22 = +0.6; c32 = -0.9;
c13 = +0.4; c23 = +0.7; c33 = +0.10;
@@ -24,6 +36,7 @@ convolution(float *a, float *aconv)
return;
for (i = x; i < N - 1; i += stridex) {
for (j = 1; j < N - 1; j++) {
+ /* Taken from the lab's example code. */
aconv[i * N + j] =
c11 * a[(i - 1) * N + (j - 1)] +
c12 * a[i * N + (j - 1)] +
@@ -48,7 +61,7 @@ min_diagonal(float *arr, float *min_arr)
if (x >= N)
return;
- /* calculate local minimums */
+ /* Calculate local minimums */
min_arr[x] = arr[x * N + x];
for (i = x; i < N; i += stridex)
if (arr[i * N + i] < min_arr[x])
@@ -79,26 +92,31 @@ main(int argc, char *argv[])
srand(time(NULL));
- /*
- * use unified memory to avoid having additional device arrays and
- * memcpying from host to device and vice versa
+ /*
+ * Use unified memory to avoid having additional device arrays and
+ * memcpying from host to device and vice versa.
+ *
+ * https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
*/
cudaMallocManaged(&a, DIM * sizeof(float));
cudaMallocManaged(&aconv, DIM * sizeof(float));
cudaMallocManaged(&min_arr, DIM * sizeof(float));
- /* initialize array */
+ /* Initialize array */
for (i = 0; i < DIM; i++)
a[i] = (float)(rand() % 100);
convolution<<<NBLK, BLKSIZE>>>(a, aconv);
- /* wait for all devices to finish */
+ /* Wait for the device to finish all queued work */
cudaDeviceSynchronize();
min_diagonal<<<NBLK, BLKSIZE>>>(aconv, min_arr);
cudaDeviceSynchronize();
- /* find global minimum */
+ /*
+ * Find global minimum using the local minimums calculated in
+ * min_diagonal().
+ */
min = min_arr[0];
for (i = 0; i < N; i++)
if (min_arr[i] < min)
diff --git a/c_cuda_parallel_systems/ex2/ex2b_b.cu b/c_cuda_parallel_systems/ex2/ex2b_b.cu
@@ -1,31 +1,34 @@
#include <stdio.h>
#include <time.h>
-#define N (1 << 2)
-#define M (1 << 1)
+#define N (1 << 3)
+#define M (1 << 3)
#define DIM (N * M)
-#define BLKSIZE (1 << 8)
+#define BLKSIZE (1 << 10)
#define NBLK ((DIM + BLKSIZE - 1) / BLKSIZE)
+/*
+ * Calculations taken from lab's example code.
+ */
__global__ void
transnorm(float *a, float *atrans, float *x, float *y)
{
int i, j, idx, stridex;
- /* each thread gets a slice of the rows to work with */
+ /* Each thread gets a slice of the rows to work with */
idx = blockIdx.x * blockDim.x + threadIdx.x;
stridex = blockDim.x * gridDim.x;
if (idx >= N)
return;
- /* first thread initializes y */
+ /* Thread 0 of each block (threadIdx.x == 0) initializes y */
if (threadIdx.x == 0) {
for (i = 0; i < M; i++)
y[i] = 0;
}
for (i = idx; i < N; i += stridex) {
for (j = 0; j < M; j++) {
- /* transpose a */
+ /* Transpose A */
atrans[j * N + i] = a[i * M + j];
y[j] = atrans[j * M + i] * a[i * M + j] * x[j];
}
@@ -69,16 +72,16 @@ main(int argc, char *argv[])
srand(time(NULL));
- /*
- * use unified memory to avoid having additional device arrays and
- * memcpying from host to device and vice versa
+ /*
+ * Use unified memory to avoid having additional device arrays and
+ * memcpying from host to device and vice versa.
*/
cudaMallocManaged(&a, DIM * sizeof(float));
cudaMallocManaged(&atrans, DIM * sizeof(float));
cudaMallocManaged(&x, M * sizeof(float));
cudaMallocManaged(&y, M * sizeof(float));
- /* initialize array */
+ /* Initialize arrays */
for (i = 0; i < N; i++) {
x[i] = (float)(rand() % 100);
for (j = 0; j < M; j++)
@@ -86,7 +89,7 @@ main(int argc, char *argv[])
}
transnorm<<<NBLK, BLKSIZE>>>(a, atrans, x, y);
- /* wait for all devices to finish */
+ /* Wait for the device to finish all queued work */
cudaDeviceSynchronize();
pretty_print_2d(a, "A", N, M);
diff --git a/c_cuda_parallel_systems/ex2/res/ex2a_1.png b/c_cuda_parallel_systems/ex2/res/ex2a_1.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2a_2.png b/c_cuda_parallel_systems/ex2/res/ex2a_2.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2a_3.png b/c_cuda_parallel_systems/ex2/res/ex2a_3.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_a_1.png b/c_cuda_parallel_systems/ex2/res/ex2b_a_1.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_a_2.png b/c_cuda_parallel_systems/ex2/res/ex2b_a_2.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_a_3.png b/c_cuda_parallel_systems/ex2/res/ex2b_a_3.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_b_1.png b/c_cuda_parallel_systems/ex2/res/ex2b_b_1.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_b_2.png b/c_cuda_parallel_systems/ex2/res/ex2b_b_2.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_b_3.png b/c_cuda_parallel_systems/ex2/res/ex2b_b_3.png
Binary files differ.