CUDA

Here is the CUDA code we will be using. It fills two integer vectors of length 1000 and adds them element-wise on the GPU, with each of the N thread blocks computing one element of the result.


#include "stdio.h"
#include <sys/time.h>
#include <cuda.h>

#define N 1000
__global__ void add(int *a, int *b, int *c)
{
int tID = blockIdx.x;
if (tID < N)
{
c[tID] = a[tID] + b[tID];
}
}
int main()
{
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
cudaMalloc((void **) &dev_a, N*sizeof(int));
cudaMalloc((void **) &dev_b, N*sizeof(int));
cudaMalloc((void **) &dev_c, N*sizeof(int));
// Fill Arrays
for (int i = 0; i < N; i++)
{
a[i] = i,
b[i] = 1;
}
cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);
add<<<N,1>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++)
{
printf("%d + %d = %d\n", a[i], b[i], c[i]);
}

return 0;
}

Save the code as vector-add.cu.
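
The CUDA API calls above do not check their return values, so a failed allocation or copy would go unnoticed and the program would simply print wrong results. As an optional refinement, a minimal error-checking wrapper could look like the sketch below (the CUDA_CHECK macro name is just an illustration, not part of the original code):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Illustrative helper: abort with a message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",              \
                    __FILE__, __LINE__, cudaGetErrorString(err));     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Example usage inside main():
//   CUDA_CHECK(cudaMalloc((void **) &dev_a, N*sizeof(int)));
//   CUDA_CHECK(cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice));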

We will now compile and run the code using the following SLURM batch script, cuda.sh:

#!/bin/bash

#SBATCH --job-name=cuda-add               # job name
#SBATCH --partition=peregrine-gpu         # partition to which the job is submitted
#SBATCH --qos=gpu_debug                   # QOS type
#SBATCH --nodes=1                         # node count
#SBATCH --ntasks=1                        # total number of tasks across all nodes
#SBATCH --cpus-per-task=1                 # CPU cores per task
#SBATCH --mem=4G                          # total memory per node
#SBATCH --gres=gpu:nvidia_a100_3g.39gb:1  # request one A100 MIG instance (3g.39gb)
#SBATCH --time=00:05:00                   # wall time

module load cuda
nvcc vector-add.cu -o vector-add
srun vector-add
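
Depending on the CUDA toolkit version provided by module load cuda, you can optionally pass an architecture flag to nvcc (for example, -arch=sm_80 targets the A100). The plain invocation above also works on most setups, because nvcc embeds PTX that the driver can JIT-compile for the GPU at run time.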

Submit the job with

sbatch cuda.sh
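
While the job is queued or running, you can check its state with squeue -u $USER; once it finishes, the output file appears in the directory from which you submitted the job.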

The result will be saved in a file named slurm-####.out and should look like this:

0 + 1 = 1
1 + 1 = 2
2 + 1 = 3
3 + 1 = 4
4 + 1 = 5
5 + 1 = 6
6 + 1 = 7
---------
---------
996 + 1 = 997
997 + 1 = 998
998 + 1 = 999
999 + 1 = 1000
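
Scrolling through 1000 printed lines is tedious. As an optional variant (not part of the original example), the print loop in main() could be replaced by a quick host-side check that every element of c is correct:

// Alternative to the print loop: verify the result on the host.
int errors = 0;
for (int i = 0; i < N; i++)
{
    if (c[i] != a[i] + b[i])
    {
        errors++;
    }
}
printf("%s: %d errors out of %d elements\n",
       errors == 0 ? "PASSED" : "FAILED", errors, N);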