Environment

CUDA: cuda-11
Driver: 460.67
OS: Manjaro Linux (kernel 5.10.18-1-MANJARO)
CMake: 3.19.5

The directory structure is as follows:

├── CMakeLists.txt
├── include
│   └── sumMatrix.h
├── main.cu
└── src
    ├── CMakeLists.txt
    └── sumMatrix.cu

2 directories, 5 files

CUDA function

For simplicity, the CUDA code implements a two-dimensional matrix addition. Header file (include/sumMatrix.h):

// Public header for the sumMatrix CUDA kernel; the guard below prevents
// multiple inclusion.
#ifndef SUM_MATRIX_CU_H
#define SUM_MATRIX_CU_H
#include <cuda_runtime.h>
// Kernel declaration; the definition lives in src/sumMatrix.cu.
//   a, b   : float buffers; the kernel writes a[i] = a[i] + b[i] into `a`.
//            The caller in main.cu passes pointers obtained from cudaMalloc.
//   nx, ny : matrix extents; the definition computes the flat index as
//            idy * nx + idx, i.e. nx is used as the row length.
// Launched from host code with sumMatrix<<<grid, block>>>(a, b, nx, ny).
__global__ void sumMatrix(float *a, float *b, int nx, int ny);
#endif  // SUM_MATRIX_CU_H
Copy the code

Source file implementation (src/sumMatrix.cu):

#include "sumMatrix.h"
/**
 * In-place element-wise matrix addition: a[y][x] += b[y][x].
 *
 * Both matrices are stored row-major with `nx` elements per row and `ny`
 * rows, so element (x, y) lives at flat offset y * nx + x.
 *
 * Launch layout: a 2D grid of 2D blocks, one thread per element. The grid may
 * overshoot the matrix in either dimension; surplus threads do nothing.
 *
 * @param a   device pointer to the first operand; receives the sum
 * @param b   device pointer to the second operand (only read)
 * @param nx  number of columns (row length)
 * @param ny  number of rows
 */
__global__ void sumMatrix(float *a, float *b, int nx, int ny) {
    // Column comes from the x components, row from the y components.
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int idy = threadIdx.y + blockDim.y * blockIdx.y;
    // printf("==> (%d,%d) threadidx:%d index:%d Current x:%.2f,y:%.2f\n",idx,idy,threadIdx.x,index,a[index],b[index]);
    // Guard each coordinate against its own extent; comparing the flat index
    // against nx/ny would reject almost every valid element.
    if (idx < nx && idy < ny) {
        // Widen before multiplying so large matrices cannot overflow int.
        size_t index = (size_t)idy * (size_t)nx + (size_t)idx;
        a[index] = a[index] + b[index];
    }
}
Copy the code

Main function implementation: main.cu

#include <cstdlib>
#include <cstring>
#include "sumMatrix.h"
#include <stdio.h>
// Writes `value` into the first `size` floats of the buffer starting at `f`.
// Nothing is written when `size` is zero or negative.
void initData(float *f, int size, float value) {
  float *cursor = f;
  for (int remaining = size; remaining > 0; --remaining) {
    *cursor = value;
    ++cursor;
  }
}

// Debug helper: prints the first `n` values of `a`, one per line, with three
// decimal places.
void check_data(float *a, int n) {
  int i = 0;
  while (i < n) {
    printf("Current :%.3f\n", a[i]);
    ++i;
  }
}
// Aborts the program with file/line context when a CUDA runtime call fails.
// Kernel launches return nothing, so they are checked via cudaGetLastError().
#define CUDA_CHECK(call)                                                \
  do {                                                                  \
    cudaError_t err_ = (call);                                          \
    if (err_ != cudaSuccess) {                                          \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
              cudaGetErrorString(err_));                                \
      std::exit(EXIT_FAILURE);                                          \
    }                                                                   \
  } while (0)

// Demo driver: fills two 32x32 matrices on the host, adds them on the GPU with
// sumMatrix, copies the result back and compares it with a host reference.
// Returns EXIT_SUCCESS when every element matches, EXIT_FAILURE otherwise.
int main() {
  const int dev = 0;
  cudaDeviceProp deviceProp;
  CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
  CUDA_CHECK(cudaSetDevice(dev));

  const int nx = 1 << 5;
  const int ny = 1 << 5;

  const int nxy = nx * ny;
  const size_t nBytes = (size_t)nxy * sizeof(float);

  float *h_a = (float *)malloc(nBytes);
  float *h_b = (float *)malloc(nBytes);
  float *hostRef = (float *)malloc(nBytes);
  float *gpuRef = (float *)malloc(nBytes);
  if (h_a == NULL || h_b == NULL || hostRef == NULL || gpuRef == NULL) {
    fprintf(stderr, "host allocation failed\n");
    free(h_a);
    free(h_b);
    free(hostRef);
    free(gpuRef);
    return EXIT_FAILURE;
  }

  // Initialise every element (nx * ny of them), not just the first row.
  initData(h_a, nxy, 1.0f);
  initData(h_b, nxy, 2.0f);

  // Host reference result, used to validate what the GPU produced.
  for (int i = 0; i < nxy; i++)
    hostRef[i] = h_a[i] + h_b[i];
  memset(gpuRef, 0, nBytes);

  float *d_a = NULL, *d_b = NULL;
  CUDA_CHECK(cudaMalloc((void **)&d_a, nBytes));
  CUDA_CHECK(cudaMalloc((void **)&d_b, nBytes));

  CUDA_CHECK(cudaMemcpy(d_a, h_a, nBytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_b, h_b, nBytes, cudaMemcpyHostToDevice));

  const int dimx = 32;
  const int dimy = 32;
  // A 32x32 block is 1024 threads; refuse to launch if the device allows fewer.
  if (dimx * dimy > deviceProp.maxThreadsPerBlock) {
    fprintf(stderr, "block of %d threads exceeds device limit %d\n",
            dimx * dimy, deviceProp.maxThreadsPerBlock);
    return EXIT_FAILURE;
  }
  dim3 block(dimx, dimy);
  // Ceil-divide so the grid covers the matrix when sizes are not multiples
  // of the block dimensions.
  dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
  printf("grid:(%d,%d),Block:(%d,%d)",grid.x,grid.y,block.x,block.y);

  sumMatrix<<<grid, block>>>(d_a, d_b, nx, ny);
  CUDA_CHECK(cudaGetLastError());       // invalid launch configuration
  CUDA_CHECK(cudaDeviceSynchronize());  // faults raised while the kernel ran
  CUDA_CHECK(cudaMemcpy(gpuRef, d_a, nBytes, cudaMemcpyDeviceToHost));
  //check_data(gpuRef, 10);

  // Compare against the host reference with a small absolute tolerance.
  int mismatches = 0;
  for (int i = 0; i < nxy; i++) {
    float diff = gpuRef[i] - hostRef[i];
    if (diff < 0.0f)
      diff = -diff;
    if (diff > 1e-6f) {
      if (mismatches == 0)
        fprintf(stderr, "\nmismatch at %d: host %.3f, gpu %.3f\n", i,
                hostRef[i], gpuRef[i]);
      mismatches++;
    }
  }
  if (mismatches != 0)
    fprintf(stderr, "%d of %d elements differ\n", mismatches, nxy);

  CUDA_CHECK(cudaFree(d_a));
  CUDA_CHECK(cudaFree(d_b));
  free(h_a);
  free(h_b);
  free(hostRef);
  free(gpuRef);

  return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
Copy the code

Project CMakeLists file (CMakeLists.txt):

cmake_minimum_required(VERSION 3.14)

# Enable language support. Listing CUDA here already enables the language, so
# no separate enable_language(CUDA) call is needed.
project(Matrix_Demo LANGUAGES CXX CUDA)

# Configure the CUDA architectures to compile for.
set(CMAKE_CUDA_ARCHITECTURES 52 80)

# Directory where the library will be compiled.
add_subdirectory(src)

add_executable(main main.cu)

# main.cu includes "sumMatrix.h", which lives in include/.
target_include_directories(main PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)

# Link to the generated library.
target_link_libraries(main PRIVATE matrix)

Library configuration file (src/CMakeLists.txt):

# Collect the CUDA sources that sit next to this CMakeLists.txt.
file(GLOB CUDA_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)

# Static library holding the sumMatrix kernel.
add_library(matrix ${CUDA_SRC})

# PUBLIC: the header directory is needed to build the library and is also
# passed on to every target that links against it (e.g. main). A plain
# include_directories() here would only affect this directory.
target_include_directories(matrix PUBLIC ${PROJECT_SOURCE_DIR}/include)
Copy the code

Compile and run

mkdir build && cd build && cmake ..

-- The CUDA compiler identification is NVIDIA 11.0.221
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /opt/cuda/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Configuring done
-- Generating done
-- Build files have been written to: /home/bleedingfight/test/cudnn/build

Compile: make -j16

[ 25%] Building CUDA object src/CMakeFiles/matrix.dir/sumMatrix.cu.o
[ 50%] Linking CUDA static library libmatrix.a
[ 50%] Built target matrix
Scanning dependencies of target main
[ 75%] Building CUDA object CMakeFiles/main.dir/main.cu.o
[100%] Linking CUDA executable main
[100%] Built target main

Copy the code

Run: ./main

grid:(1,1),Block:(32,32)

mo4tech.com (Moment For Technology) is a global community with thousands techies from across the global hang out!Passionate technologists, be it gadget freaks, tech enthusiasts, coders, technopreneurs, or CIOs, you would find them all here.

CUDA programming basics

The environment that

The directory structure is as follows:

Cuda function

Compile and run

CUDA programming basics

The environment that

The directory structure is as follows:

Cuda function

Compile and run

Related Posts

Pytorch–Tensor

A little program that tests the age of the left and right brain. Is that real?

TensorFlowTensorFlow tutorial summary –Keras Machine Learning Basics ③ Text categorization using TF Hub