done - uni - University stuff

commit ffcdbe764a8b51912ea821f6c30fb6960a43dbf2
parent ee20f660a7120358db0a81a4f3e02f60f402c26f
Author: Christos Margiolis <christos@margiolis.net>
Date:   Tue, 31 Jan 2023 14:17:00 +0200

done

Diffstat:
A c_cuda_parallel_systems/ex2/doc.pdf  | 0 
A c_cuda_parallel_systems/ex2/doc.tex  | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A c_cuda_parallel_systems/ex2/doc.toc  | 24 ++++++++++++++++++++++++
M c_cuda_parallel_systems/ex2/ex2a.c  | 11 ++++++++++-
M c_cuda_parallel_systems/ex2/ex2b_a.cu  | 36 +++++++++++++++++++++++++++---------
M c_cuda_parallel_systems/ex2/ex2b_b.cu  | 25 ++++++++++++++-----------
A c_cuda_parallel_systems/ex2/res/ex2a_1.png  | 0 
A c_cuda_parallel_systems/ex2/res/ex2a_2.png  | 0 
A c_cuda_parallel_systems/ex2/res/ex2a_3.png  | 0 
A c_cuda_parallel_systems/ex2/res/ex2b_a_1.png  | 0 
A c_cuda_parallel_systems/ex2/res/ex2b_a_2.png  | 0 
A c_cuda_parallel_systems/ex2/res/ex2b_a_3.png  | 0 
A c_cuda_parallel_systems/ex2/res/ex2b_b_1.png  | 0 
A c_cuda_parallel_systems/ex2/res/ex2b_b_2.png  | 0 
A c_cuda_parallel_systems/ex2/res/ex2b_b_3.png  | 0

15 files changed, 182 insertions(+), 21 deletions(-)
diff --git a/c_cuda_parallel_systems/ex2/doc.pdf b/c_cuda_parallel_systems/ex2/doc.pdf
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/doc.tex b/c_cuda_parallel_systems/ex2/doc.tex
@@ -0,0 +1,107 @@
+\documentclass{article}
+\usepackage[utf8]{inputenc}
+\usepackage[greek,english]{babel}
+\usepackage{alphabeta}
+\usepackage{fancyhdr}
+\usepackage{listings}
+\usepackage{mathtools}
+\usepackage{xcolor}
+\usepackage{biblatex}
+\usepackage[left=1cm,right=1cm]{geometry}
+
+\lstset {
+        basicstyle=\ttfamily,
+        columns=fullflexible,
+        breaklines=true,
+        keepspaces=true,
+	showstringspaces=false
+}
+
+\title{Εργαστήριο Παράλληλων Συστημάτων - Εργασία 2}
+\author{Χρήστος Μαργιώλης}
+\date{Ιανουάριος 2023}
+
+\begin{document}
+
+\begin{titlepage}
+        \maketitle
+\end{titlepage}
+
+\renewcommand{\contentsname}{Περιεχόμενα}
+\tableofcontents
+\pagebreak
+
+\section{Προγράμματα}
+
+Οι κώδικες έχουν σχόλια μόνο στα σημεία που θεώρησα ότι μπορεί να προκύψει
+κάποιο «μπέρδεμα».
+
+\subsection{Άσκηση 2Α}
+
+\subsubsection{Κώδικας}
+\lstinputlisting[language=C]{ex2a.c}
+\pagebreak
+\subsubsection{Ενδεικτικά τρεξίματα}
+
+\begin{lstlisting}
+usage: ./a.out nthreads n
+\end{lstlisting}
+
+Για \lstinline{nthreads = 2} και \lstinline{n = 10}: \\
+
+\includegraphics[width=\textwidth]{res/ex2a_1.png} \\
+
+Για \lstinline{nthreads = 8} και \lstinline{n = 100}: \\
+
+\includegraphics[width=\textwidth]{res/ex2a_2.png} \\
+
+Για \lstinline{nthreads = 16} και \lstinline{n = 1000000}. Λόγω του αριθμού των
+στοιχείων, το στιγμιότυπο δείχνει μόνο τον χρόνο υπολογισμού: \\
+
+\includegraphics[width=\textwidth]{res/ex2a_3.png} \\
+\pagebreak
+
+\subsection{Άσκηση 2Β-Α}
+
+\subsubsection{Κώδικας}
+\lstinputlisting[language=C]{ex2b_a.cu}
+\pagebreak
+\subsubsection{Ενδεικτικά τρεξίματα}
+
+Για \lstinline{NxN = 8x8} και \lstinline{blocksize = 256} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_a_1.png} \\
+
+Για \lstinline{NxN = 32x32} και \lstinline{blocksize = 1024} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_a_2.png} \\
+
+Για \lstinline{NxN = 4x4} και \lstinline{blocksize = 256} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_a_3.png} \\
+\pagebreak
+
+\subsection{Άσκηση 2Β-Β}
+
+\subsubsection{Κώδικας}
+\lstinputlisting[language=C]{ex2b_b.cu}
+\pagebreak
+\subsubsection{Ενδεικτικά τρεξίματα}
+
+Για \lstinline{NxM = 4x2} και \lstinline{blocksize = 256} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_b_1.png} \\
+
+Για \lstinline{NxM = 4x8} και \lstinline{blocksize = 256} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_b_2.png} \\
+
+Για \lstinline{NxM = 8x8} και \lstinline{blocksize = 1024} \\
+
+\includegraphics[width=\textwidth]{res/ex2b_b_3.png} \\
+
+\section{Προβλήματα}
+
+Δεν υλοποίησα την άσκηση 2B-Γ (συνδιακύμανση).
+
+\end{document}
diff --git a/c_cuda_parallel_systems/ex2/doc.toc b/c_cuda_parallel_systems/ex2/doc.toc
@@ -0,0 +1,24 @@
+\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax 
+\babel@toc {english}{}
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {1}Προγράμματα}{2}{}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {1.1}'Ασκηση 2Α}{2}{}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{\numberline {1.1.1}Κώδικας}{2}{}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{\numberline {1.1.2}Ενδεικτικά τρεξίματα}{6}{}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {1.2}'Ασκηση 2Β-Α}{7}{}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{\numberline {1.2.1}Κώδικας}{7}{}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{\numberline {1.2.2}Ενδεικτικά τρεξίματα}{10}{}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsection}{\numberline {1.3}'Ασκηση 2Β-Β}{13}{}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{\numberline {1.3.1}Κώδικας}{13}{}%
+\defcounter {refsection}{0}\relax 
+\contentsline {subsubsection}{\numberline {1.3.2}Ενδεικτικά τρεξίματα}{16}{}%
+\defcounter {refsection}{0}\relax 
+\contentsline {section}{\numberline {2}Προβλήματα}{18}{}%
diff --git a/c_cuda_parallel_systems/ex2/ex2a.c b/c_cuda_parallel_systems/ex2/ex2a.c
@@ -11,7 +11,7 @@ static void	merge(int *, int *, int *, int *, int *);
 static void	multisort(int *, int *, int);
 
 /*
- * Print the contents of a 2D array like:
+ * Print the contents of an array like:
  *
  * array = [x, y, z]
  */
@@ -35,6 +35,9 @@ cmpfunc(const void *a, const void *b)
 	return (*(int *)a - *(int *)b);
 }
 
+/*
+ * Merge sort
+ */
 static void
 merge(int *a, int *enda, int *b, int *endb, int *res)
 {
@@ -55,6 +58,10 @@ multisort(int *arr, int *space, int n)
 {
 	int quarter, *sta, *spa, *stb, *spb, *stc, *spc, *std, *spd;
 
+	/*
+	 * Sort with qsort(3) directly if a quarter of the array would hold
+	 * fewer than 4 elements.
+	 */
 	if ((quarter = n / 4) < 4)
 		qsort(arr, n, sizeof(int), cmpfunc);
 	else {
@@ -67,6 +74,7 @@ multisort(int *arr, int *space, int n)
 		spc = spb + quarter;
 		std = stc + quarter;
 		spd = spc + quarter;
+		/* Sort each quarter */
 #pragma omp task
 		multisort(sta, spa, quarter);
 #pragma omp task
@@ -115,6 +123,7 @@ main(int argc, char *argv[])
 	for (i = 0; i < n; i++)
 		a[i] = rand() % 100;
 
+	/* Start timing; the elapsed time is used to calculate speed up */
 	start = omp_get_wtime();
 
 	pretty_print(a, n, "A_unsorted");
diff --git a/c_cuda_parallel_systems/ex2/ex2b_a.cu b/c_cuda_parallel_systems/ex2/ex2b_a.cu
@@ -3,8 +3,14 @@
 
 #define N	(1 << 2)
 #define DIM	(N * N)
+/*
+ * The formula used for NBLK (the number of blocks) below is described in the
+ * "Out of the Blocks" section of:
+ *
+ * https://developer.nvidia.com/blog/even-easier-introduction-cuda/
+ */
 #define BLKSIZE	(1 << 8)
-#define NBLK	((N + BLKSIZE - 1) / BLKSIZE)
+#define NBLK	((DIM + BLKSIZE - 1) / BLKSIZE)
 
 __global__ void
 convolution(float *a, float *aconv)
@@ -12,10 +18,16 @@ convolution(float *a, float *aconv)
 	float c11, c12, c13, c21, c22, c23, c31, c32, c33;
 	int i, j, x, stridex;
 
-	/* each thread gets a slice of the rows to work with */
+	/*
+	 * Each thread gets a slice of the rows to work with. Grid-stride idiom
+	 * described in the "Out of the Blocks" section of:
+	 *
+	 * https://developer.nvidia.com/blog/even-easier-introduction-cuda/
+	 */
 	x = blockIdx.x * blockDim.x + threadIdx.x;
 	stridex = blockDim.x * gridDim.x;
 
+	/* Arbitrary, hard-coded weight values */
 	c11 = +0.2;  c21 = +0.5;  c31 = -0.8;
 	c12 = -0.3;  c22 = +0.6;  c32 = -0.9;
 	c13 = +0.4;  c23 = +0.7;  c33 = +0.10;
@@ -24,6 +36,7 @@ convolution(float *a, float *aconv)
 		return;
 	for (i = x; i < N - 1; i += stridex) {
 		for (j = 1; j < N - 1; j++) {
+			/* Taken from the lab's example code. */
 			aconv[i * N + j] = 
 			    c11 * a[(i - 1)	* N + (j - 1)] +
 			    c12 * a[i		* N + (j - 1)] +
@@ -48,7 +61,7 @@ min_diagonal(float *arr, float *min_arr)
 
 	if (x >= N)
 		return;
-	/* calculate local minimums */
+	/* Calculate local minimums */
 	min_arr[x] = arr[x * N + x];
 	for (i = x; i < N; i += stridex)
 		if (arr[i * N + i] < min_arr[x])
@@ -79,26 +92,31 @@ main(int argc, char *argv[])
 
 	srand(time(NULL));
 
-	/* 
-	 * use unified memory to avoid having additional device arrays and
-	 * memcpying from host to device and vice versa
+	/*
+	 * Use unified memory to avoid having additional device arrays and
+	 * memcpying from host to device and vice versa.
+	 *
+	 * https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
 	 */
 	cudaMallocManaged(&a, DIM * sizeof(float));
 	cudaMallocManaged(&aconv, DIM * sizeof(float));
 	cudaMallocManaged(&min_arr, DIM * sizeof(float));
 
-	/* initialize array */
+	/* Initialize array */
 	for (i = 0; i < DIM; i++)
 		a[i] = (float)(rand() % 100);
 
 	convolution<<<NBLK, BLKSIZE>>>(a, aconv);
-	/* wait for all devices to finish */
+	/* Wait for the device to finish */
 	cudaDeviceSynchronize();
 
 	min_diagonal<<<NBLK, BLKSIZE>>>(aconv, min_arr);
 	cudaDeviceSynchronize();
 
-	/* find global minimum */
+	/*
+	 * Find global minimum using the local minimums calculated in
+	 * min_diagonal().
+	 */
 	min = min_arr[0];
 	for (i = 0; i < N; i++)
 		if (min_arr[i] < min)
diff --git a/c_cuda_parallel_systems/ex2/ex2b_b.cu b/c_cuda_parallel_systems/ex2/ex2b_b.cu
@@ -1,31 +1,34 @@
 #include <stdio.h>
 #include <time.h>
 
-#define N	(1 << 2)
-#define M	(1 << 1)
+#define N	(1 << 3)
+#define M	(1 << 3)
 #define DIM	(N * M)
-#define BLKSIZE	(1 << 8)
+#define BLKSIZE	(1 << 10)
 #define NBLK	((DIM + BLKSIZE - 1) / BLKSIZE)
 
+/*
+ * Calculations taken from lab's example code.
+ */
 __global__ void
 transnorm(float *a, float *atrans, float *x, float *y)
 {
 	int i, j, idx, stridex;
 
-	/* each thread gets a slice of the rows to work with */
+	/* Each thread gets a slice of the rows to work with */
 	idx = blockIdx.x * blockDim.x + threadIdx.x;
 	stridex = blockDim.x * gridDim.x;
 
 	if (idx >= N)
 		return;
-	/* first thread initializes y */
+	/* First thread of each block initializes y */
 	if (threadIdx.x == 0) {
 		for (i = 0; i < M; i++)
 			y[i] = 0;
 	}
 	for (i = idx; i < N; i += stridex) {
 		for (j = 0; j < M; j++) {
-			/* transpose a */
+			/* Transpose A */
 			atrans[j * N + i] = a[i * M + j];
 			y[j] = atrans[j * M + i] * a[i * M + j] * x[j];
 		}
@@ -69,16 +72,16 @@ main(int argc, char *argv[])
 
 	srand(time(NULL));
 
-	/* 
-	 * use unified memory to avoid having additional device arrays and
-	 * memcpying from host to device and vice versa
+	/*
+	 * Use unified memory to avoid having additional device arrays and
+	 * memcpying from host to device and vice versa.
 	 */
 	cudaMallocManaged(&a, DIM * sizeof(float));
 	cudaMallocManaged(&atrans, DIM * sizeof(float));
 	cudaMallocManaged(&x, M * sizeof(float));
 	cudaMallocManaged(&y, M * sizeof(float));
 
-	/* initialize array */
+	/* Initialize arrays */
 	for (i = 0; i < N; i++) {
 		x[i] = (float)(rand() % 100);
 		for (j = 0; j < M; j++)
@@ -86,7 +89,7 @@ main(int argc, char *argv[])
 	}
 
 	transnorm<<<NBLK, BLKSIZE>>>(a, atrans, x, y);
-	/* wait for all devices to finish */
+	/* Wait for the device to finish */
 	cudaDeviceSynchronize();
 
 	pretty_print_2d(a, "A", N, M);
diff --git a/c_cuda_parallel_systems/ex2/res/ex2a_1.png b/c_cuda_parallel_systems/ex2/res/ex2a_1.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2a_2.png b/c_cuda_parallel_systems/ex2/res/ex2a_2.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2a_3.png b/c_cuda_parallel_systems/ex2/res/ex2a_3.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_a_1.png b/c_cuda_parallel_systems/ex2/res/ex2b_a_1.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_a_2.png b/c_cuda_parallel_systems/ex2/res/ex2b_a_2.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_a_3.png b/c_cuda_parallel_systems/ex2/res/ex2b_a_3.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_b_1.png b/c_cuda_parallel_systems/ex2/res/ex2b_b_1.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_b_2.png b/c_cuda_parallel_systems/ex2/res/ex2b_b_2.png
Binary files differ.
diff --git a/c_cuda_parallel_systems/ex2/res/ex2b_b_3.png b/c_cuda_parallel_systems/ex2/res/ex2b_b_3.png
Binary files differ.

	uni University stuff
	git clone git://git.margiolis.net/uni.git
	Log \| Files \| Refs \| README \| LICENSE

A	c_cuda_parallel_systems/ex2/doc.pdf	\|	0
A	c_cuda_parallel_systems/ex2/doc.tex	\|	107	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	c_cuda_parallel_systems/ex2/doc.toc	\|	24	++++++++++++++++++++++++
M	c_cuda_parallel_systems/ex2/ex2a.c	\|	11	++++++++++-
M	c_cuda_parallel_systems/ex2/ex2b_a.cu	\|	36	+++++++++++++++++++++++++++---------
M	c_cuda_parallel_systems/ex2/ex2b_b.cu	\|	25	++++++++++++++-----------
A	c_cuda_parallel_systems/ex2/res/ex2a_1.png	\|	0
A	c_cuda_parallel_systems/ex2/res/ex2a_2.png	\|	0
A	c_cuda_parallel_systems/ex2/res/ex2a_3.png	\|	0
A	c_cuda_parallel_systems/ex2/res/ex2b_a_1.png	\|	0
A	c_cuda_parallel_systems/ex2/res/ex2b_a_2.png	\|	0
A	c_cuda_parallel_systems/ex2/res/ex2b_a_3.png	\|	0
A	c_cuda_parallel_systems/ex2/res/ex2b_b_1.png	\|	0
A	c_cuda_parallel_systems/ex2/res/ex2b_b_2.png	\|	0
A	c_cuda_parallel_systems/ex2/res/ex2b_b_3.png	\|	0