Here is the CUDA code we will be using. It defines two vectors and adds them element-wise on the GPU.
#include "stdio.h"
#include <sys/time.h>
#include <cuda.h>
#define N 1000
// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, N).
// Expects a 1-D launch covering at least N threads total. The flat global
// index generalizes the original blockIdx.x-only scheme: with the file's
// <<<N, 1>>> launch (blockDim.x == 1, threadIdx.x == 0) it reduces to
// exactly blockIdx.x, but it also works for multi-thread blocks such as
// <<<(N + 255) / 256, 256>>>.
__global__ void add(int *a, int *b, int *c)
{
    // Flat global thread index across the whole grid.
    int tID = blockIdx.x * blockDim.x + threadIdx.x;
    // Bounds guard: the grid may contain more threads than N elements.
    if (tID < N)
    {
        c[tID] = a[tID] + b[tID];
    }
}
// Host driver: fills two N-element vectors, adds them on the GPU with one
// thread per element, copies the result back, and prints every sum.
// Returns 0 on success, 1 on a CUDA error.
int main()
{
    int a[N], b[N], c[N];        // host arrays (inputs a, b; output c)
    int *dev_a, *dev_b, *dev_c;  // device buffers

    // Allocate one N-int buffer per vector on the device.
    cudaMalloc((void **) &dev_a, N * sizeof(int));
    cudaMalloc((void **) &dev_b, N * sizeof(int));
    cudaMalloc((void **) &dev_c, N * sizeof(int));

    // Fill arrays: a = 0, 1, ..., N-1 and b = all ones.
    for (int i = 0; i < N; i++)
    {
        a[i] = i;  // original used a comma operator ("a[i] = i,"); semicolon intended
        b[i] = 1;
    }

    // Copy inputs host -> device.
    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);

    // Launch N blocks of 1 thread (one thread per element, matching the
    // kernel's indexing). Kernel launches do not return an error code
    // directly, so check cudaGetLastError() afterwards.
    add<<<N, 1>>>(dev_a, dev_b, dev_c);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
        return 1;
    }

    // Blocking copy device -> host; this also synchronizes with the kernel,
    // so c is fully written before we read it below.
    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < N; i++)
    {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    // Release device memory (the original leaked all three allocations).
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
Save the code as vector-add.cu.
We will now compile and run the code using the following SLURM script, cuda.sh:
#!/bin/bash
#SBATCH --job-name=cuda-add # job name
#SBATCH --partition=peregrine-gpu # partition to which job should be submitted
#SBATCH --qos=gpu_debug # qos type
#SBATCH --nodes=1 # node count
#SBATCH --ntasks=1 # total number of tasks across all nodes
#SBATCH --cpus-per-task=1 # cpu-cores per task
#SBATCH --mem=4G # total memory per node
#SBATCH --gres=gpu:nvidia_a100_3g.39gb:1 # Request 1 A100 MIG slice (3g.39gb profile)
#SBATCH --time=00:05:00 # wall time
module load cuda
nvcc vector-add.cu -o vector-add
srun vector-add
Submit the job as
sbatch cuda.sh
The result will be saved in a file named slurm-####.out
and should look like
0 + 1 = 1
1 + 1 = 2
2 + 1 = 3
3 + 1 = 4
4 + 1 = 5
5 + 1 = 6
6 + 1 = 7
---------
---------
996 + 1 = 997
997 + 1 = 998
998 + 1 = 999
999 + 1 = 1000