Profile GPU Codes from AQUA Cluster

While is easier to code and profile on local machine. It involves some work on remote machines and ever more work on remote clusters. Helped Smit profiling on Tensor cores codes on V100.
While profiling from a remote cluster (multiple hops) is challenging But we have a solution!
Always us -lineinfo while compiling nvcc. Use -arch=sm_86 or -arch=sm_70 for tensors.
nsys profile ./a.out to profile
Export the generated files *.nsys-rep file locally and import into Nsight Compute.
Visualise and analyse.
alt text
Learn-ings

qdstrm is an intermediate result one needs to use QdstrmImporter to convert to qdrep file. I’ll leave it here. No time to explaing. Below log is self-explanatory if you are working on AQUA cluster.
Raw logs

 cs16d003  aqua  SSH  ~  temp  $  ll
total 29473
-rw-r--r-- 1 cs16d003 rupeshgrp      628 Feb 14 03:53 a.cu
-rwxr-xr-x 1 cs16d003 rupeshgrp   756848 Feb 14 03:53 a.out
-rw------- 1 cs16d003 rupeshgrp     1573 Feb 14 03:53 cmdsErrorLog.err
-rw------- 1 cs16d003 rupeshgrp     4336 Feb 14 03:53 cmdsOutputLog.log
-rw-r--r-- 1 cs16d003 rupeshgrp 29920832 Feb 14 03:53 report1.qdstrm
-rw-r--r-- 1 cs16d003 rupeshgrp   329926 Feb 14 03:53 report2.qdrep // This was generated.
-rwxr-xr-x 1 cs16d003 rupeshgrp      573 Feb 14 03:53 run.sh

 cs16d003  aqua  SSH  ~  temp  $  qsub run.sh
1225506.hn1

 cs16d003  aqua  SSH  ~  temp  $  cat run.sh 
#!/bin/bash
#PBS -e cmdsErrorLog.err
#PBS -o cmdsOutputLog.log
#PBS -q rupesh_gpuq
#PBS -l select=1:ncpus=1:ngpus=1

tpdir=`echo $PBS_JOBID | cut -f 1 -d .`
tempdir=$HOME/scratch/job$tpdir
mkdir -p $tempdir
cd $tempdir
cp -R $PBS_O_WORKDIR/* .
module load cuda11.4
module load gcc640
echo ==========
nvcc --version
echo ==========
nvidia-smi
echo ==========
pwd
cat /proc/driver/nvidia/version
echo ==========
nvcc a.cu
./a.out 
echo ==========
nsys --version
nsys profile ./a.out
echo ==========
#./sssp market test_mst.mtx 
#rm a.out
mv * $PBS_O_WORKDIR/.
rmdir $tempdir
 cs16d003  aqua  SSH  ~  temp  $  cat a.cu 
#include <cuda.h>
#include <stdio.h>

#define gpuErrchk(ans) \
    { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line,
                          bool abort = true) {
    if (code != cudaSuccess) {
          fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
                          line);
              if (abort) exit(code);
                }
}

__global__ void k() {
    printf("hello %u!\n", threadIdx.x);
}

int main(void) {
    k<<<3, 32>>>();
      cudaDeviceSynchronize();
        gpuErrchk(cudaPeekAtLastError());
          return 0;
}

 
 cs16d003  aqua  SSH  ~  temp  $  ls -lrt
total 2041
-rw-r--r-- 1 cs16d003 rupeshgrp   1483 Feb 14 11:52 b.cu
-rwxr-xr-x 1 cs16d003 rupeshgrp 756848 Feb 14 11:52 a.out
-rw-r--r-- 1 cs16d003 rupeshgrp    628 Feb 14 11:52 a.cu
-rwxr-xr-x 1 cs16d003 rupeshgrp 760936 Feb 14 11:52 b.out
-rwxr-xr-x 1 cs16d003 rupeshgrp    612 Feb 14 12:45 run.sh
 cs16d003  aqua  SSH  ~  temp  $  qsub run.sh 
1225693.hn1
 cs16d003  aqua  SSH  ~  temp  $  ll
total 2600
-rw-r--r-- 1 cs16d003 rupeshgrp    628 Feb 14 11:55 a.cu
-rwxr-xr-x 1 cs16d003 rupeshgrp 756848 Feb 14 11:55 a.out
-rw-r--r-- 1 cs16d003 rupeshgrp   1483 Feb 14 11:55 b.cu
-rwxr-xr-x 1 cs16d003 rupeshgrp 760936 Feb 14 11:55 b.out
-rw------- 1 cs16d003 rupeshgrp   1573 Feb 14 11:55 cmdsErrorLog.err
-rw------- 1 cs16d003 rupeshgrp   3014 Feb 14 11:55 cmdsOutputLog.log
-rw-r--r-- 1 cs16d003 rupeshgrp 329386 Feb 14 11:55 report1.qdrep
-rwxr-xr-x 1 cs16d003 rupeshgrp    612 Feb 14 11:55 run.sh
 cs16d003  aqua  SSH  ~  temp  $  vim b.cu 
 cs16d003  aqua  SSH  ~  temp  $  ll
total 2600
-rw-r--r-- 1 cs16d003 rupeshgrp    628 Feb 14 11:55 a.cu
-rwxr-xr-x 1 cs16d003 rupeshgrp 756848 Feb 14 11:55 a.out
-rw-r--r-- 1 cs16d003 rupeshgrp   1483 Feb 14 11:55 b.cu
-rwxr-xr-x 1 cs16d003 rupeshgrp 760936 Feb 14 11:55 b.out
-rw------- 1 cs16d003 rupeshgrp   1573 Feb 14 11:55 cmdsErrorLog.err
-rw------- 1 cs16d003 rupeshgrp   3014 Feb 14 11:55 cmdsOutputLog.log
-rw-r--r-- 1 cs16d003 rupeshgrp 329386 Feb 14 11:55 report1.qdrep
-rwxr-xr-x 1 cs16d003 rupeshgrp    612 Feb 14 11:55 run.sh
 cs16d003  aqua  SSH  ~  temp  $  cat cmdsErrorLog.err 
 
Warning: LBR backtrace method is not supported on this platform. DWARF backtrace method will be used.
Collecting data...
Processing events...
Saving temporary "/var/tmp/pbs.1225693.hn1/nsys-report-5497-a5fc-aa58-cf91.qdstrm" file to disk...

Creating final output files...
Processing [===============================================================100%]
Saved report file to "/var/tmp/pbs.1225693.hn1/nsys-report-5497-a5fc-aa58-cf91.qdrep"
Report file moved to "/lfs1/usrscratch/phd/cs16d003/job1225693/report1.qdrep"

 cs16d003  aqua  SSH  ~  temp  $  cat cmdsOutputLog.log 
==========
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Aug_15_21:14:11_PDT_2021
Cuda compilation tools, release 11.4, V11.4.120
Build cuda_11.4.r11.4/compiler.30300941_0
==========
Wed Feb 14 11:55:19 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla V100-PCIE...  Off  | 00000000:5C:00.0 Off |                    0 |
| N/A   32C    P0    23W / 250W |      0MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:D8:00.0 Off |                    0 |
| N/A   32C    P0    25W / 250W |    175MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    1   N/A  N/A     24847      G   /usr/bin/X                         63MiB |
|    1   N/A  N/A     24919      G   /usr/bin/gnome-shell              110MiB |
+-----------------------------------------------------------------------------+
==========
/lfs/usrhome/phd/cs16d003/scratch/job1225693
NVRM version: NVIDIA UNIX x86_64 Kernel Module  470.57.02  Tue Jul 13 16:14:05 UTC 2021
GCC version:  gcc version 4.8.5 20150623 (Red Hat 4.8.5-36) (GCC) 
==========
Results:
2.00 * 0.84 + 3.00 = 4.68
2.00 * 0.39 + 3.00 = 3.79
2.00 * 0.78 + 3.00 = 4.57
2.00 * 0.80 + 3.00 = 4.60
2.00 * 0.91 + 3.00 = 4.82
2.00 * 0.20 + 3.00 = 3.40
2.00 * 0.34 + 3.00 = 3.67
2.00 * 0.77 + 3.00 = 4.54
2.00 * 0.28 + 3.00 = 3.56
2.00 * 0.55 + 3.00 = 4.11
==========
NVIDIA Nsight Systems version 2021.3.2.4-027534f
Results:
2.00 * 0.84 + 3.00 = 4.68
2.00 * 0.39 + 3.00 = 3.79
2.00 * 0.78 + 3.00 = 4.57
2.00 * 0.80 + 3.00 = 4.60
2.00 * 0.91 + 3.00 = 4.82
2.00 * 0.20 + 3.00 = 3.40
2.00 * 0.34 + 3.00 = 3.67
2.00 * 0.77 + 3.00 = 4.54
2.00 * 0.28 + 3.00 = 3.56
2.00 * 0.55 + 3.00 = 4.11
==========
 cs16d003  aqua  SSH  ~  temp  $  cat run.sh 
#!/bin/bash
#PBS -e cmdsErrorLog.err
#PBS -o cmdsOutputLog.log
#PBS -q rupesh_gpuq
#PBS -l select=1:ncpus=1:ngpus=1

tpdir=`echo $PBS_JOBID | cut -f 1 -d .`
tempdir=$HOME/scratch/job$tpdir
mkdir -p $tempdir
cd $tempdir
cp -R $PBS_O_WORKDIR/* .
module load cuda11.4
module load gcc640
echo ==========
nvcc --version
echo ==========
nvidia-smi
echo ==========
pwd
cat /proc/driver/nvidia/version
echo ==========
#nvcc a.cu
#./a.out 
nvcc b.cu -o b.out -lineinfo
./b.out
echo ==========
nsys --version
nsys profile ./b.out
echo ==========
#./sssp market test_mst.mtx 
#rm a.out
mv * $PBS_O_WORKDIR/.
rmdir $tempdir
 cs16d003  aqua  SSH  ~  temp  $  cat b.cu 
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 1000000
#define BLOCK_SIZE 256

__global__ void ax_plus_b(float *x, float *result, float a, float b) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        result[idx] = a * x[idx] + b;
    }
}

int main() {
    float *h_x, *h_result, *d_x, *d_result;
    float a = 2.0f, b = 3.0f; // Constants a and b
    int size = N * sizeof(float);

    // Allocate memory on the host
    h_x = (float *)malloc(size);
    h_result = (float *)malloc(size);

    // Generate random numbers on the CPU
    for (int i = 0; i < N; ++i) {
        h_x[i] = rand() / (float)RAND_MAX;
    }

    // Allocate memory on the device
    cudaMalloc((void **)&d_x, size);
    cudaMalloc((void **)&d_result, size);

    // Copy input data from host to device
    cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice);

    // Launch kernel
    int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    ax_plus_b<<<numBlocks, BLOCK_SIZE>>>(d_x, d_result, a, b);
    ax_plus_b<<<numBlocks, BLOCK_SIZE>>>(d_x, d_result, a, b);

    // Copy result from device to host
    cudaMemcpy(h_result, d_result, size, cudaMemcpyDeviceToHost);

    // Verify the results
    printf("Results:\n");
    for (int i = 0; i < 10; ++i) {
        printf("%.2f * %.2f + %.2f = %.2f\n", a, h_x[i], b, h_result[i]);
    }

    // Free memory
    free(h_x);
    free(h_result);
    cudaFree(d_x);
    cudaFree(d_result);

    return 0;
}

 
Tue, 16-Feb-2024, 14:55:46 IST
★ 8 min read · Rajesh Pandian M · 16-Apr-2024 14:38:14 (IST) ★ GPU , aqua , profile