ubuntu18.04 nvidia显卡GPU编程

我思故我在 · 2019-10-08 09:14:00

1. 安装nvidia开发工具
https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&target_distro=Ubuntu&target_version=1804&target_type=deblocal

wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget http://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-ubuntu1804-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb
sudo dpkg -i cuda-repo-ubuntu1804-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb
sudo apt-key add /var/cuda-repo-10-1-local-10.1.243-418.87.00/7fa2af80.pub
sudo apt-get update
sudo apt-get -y install cuda

2. 测试代码:
https://linuxhint.com/gpu-programming-cpp/
https://eslinux.com/programacion-gpu-cuda/

gpu-example.cu:

#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cstdio>
#include <chrono>

typedef std::chrono::high_resolution_clock Clock;

#define ITER 65535

// Host-side reference implementation of the vector add.
// Stores a[k] + b[k] into c[k] for each k in [0, n), walking the arrays
// front to back. Nothing is written when n is zero or negative.
void vector_add_cpu(int *a, int *b, int *c, int n) {
    int idx = 0;
    while (idx < n) {
        c[idx] = a[idx] + b[idx];
        ++idx;
    }
}

// GPU version of the vector add function.
//
// Launch layout: 1-D grid of 1-D blocks. Any grid/block size is valid: each
// thread starts at its global index and advances by the total number of
// launched threads, so every index in [0, n) is written exactly once and no
// thread reads or writes at or beyond index n.
// Uses no shared memory; runs on whichever stream the caller launches it on.
__global__ void vector_add_gpu(int *gpu_a, int *gpu_b, int *gpu_c, int n) {
    // 64-bit index so that i += stride cannot overflow for large n.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n; i += stride) {
        gpu_c[i] = gpu_a[i] + gpu_b[i];
    }
}

// Print a diagnostic and terminate the program if a CUDA runtime call failed.
// `what` names the operation so the message says which step went wrong.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two ITER-element vectors on the CPU and on the GPU, prints the
// wall-clock time of each, and checks the GPU result against the CPU result.
// Returns 0 on success, non-zero if an allocation fails or the results differ.
int main() {

    int *a, *b, *c;
    int *gpu_a, *gpu_b, *gpu_c;

    a = (int *)malloc(ITER * sizeof(int));
    b = (int *)malloc(ITER * sizeof(int));
    c = (int *)malloc(ITER * sizeof(int));
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // We need variables accessible to the GPU,
    // so cudaMallocManaged provides these
    check_cuda(cudaMallocManaged(&gpu_a, ITER * sizeof(int)), "cudaMallocManaged gpu_a");
    check_cuda(cudaMallocManaged(&gpu_b, ITER * sizeof(int)), "cudaMallocManaged gpu_b");
    check_cuda(cudaMallocManaged(&gpu_c, ITER * sizeof(int)), "cudaMallocManaged gpu_c");

    // Fill the host inputs and give the GPU inputs the same values, so both
    // versions operate on identical, initialised data.
    for (int i = 0; i < ITER; ++i) {
        a[i] = i;
        b[i] = i;
        c[i] = i;
        gpu_a[i] = i;
        gpu_b[i] = i;
        gpu_c[i] = 0;
    }

    // Call the CPU function and time it
    auto cpu_start = Clock::now();
    vector_add_cpu(a, b, c, ITER);
    auto cpu_end = Clock::now();
    std::cout << "vector_add_cpu: "
    << std::chrono::duration_cast<std::chrono::nanoseconds>(cpu_end - cpu_start).count()
    << " nanoseconds.\n";

    // Call the GPU function and time it
    // The triple angle brackets are a CUDA runtime extension that passes the
    // launch configuration (number of blocks, threads per block) to a kernel.
    // A single block cannot hold ITER threads (the CUDA programming guide
    // limits a block to 1024 threads), so the work is split into enough
    // 256-thread blocks to cover all ITER elements.
    const int threads_per_block = 256;
    const int blocks = (ITER + threads_per_block - 1) / threads_per_block;
    auto gpu_start = Clock::now();
    vector_add_gpu <<<blocks, threads_per_block>>> (gpu_a, gpu_b, gpu_c, ITER);
    // A kernel launch returns nothing; configuration errors are reported here.
    check_cuda(cudaGetLastError(), "vector_add_gpu launch");
    // Wait for the kernel to finish so the timing covers its execution and
    // so gpu_c can be read from the host afterwards.
    check_cuda(cudaDeviceSynchronize(), "vector_add_gpu execution");
    auto gpu_end = Clock::now();
    std::cout << "vector_add_gpu: "
    << std::chrono::duration_cast<std::chrono::nanoseconds>(gpu_end - gpu_start).count()
    << " nanoseconds.\n";

    // Compare the GPU output with the CPU reference; a timing is only
    // meaningful if the kernel actually produced the right values.
    int mismatches = 0;
    for (int i = 0; i < ITER; ++i) {
        if (gpu_c[i] != c[i]) {
            ++mismatches;
        }
    }
    if (mismatches != 0) {
        fprintf(stderr, "vector_add_gpu: %d of %d results differ from the CPU reference\n",
                mismatches, ITER);
    }

    // Free the GPU-function based memory allocations
    check_cuda(cudaFree(gpu_a), "cudaFree gpu_a");
    check_cuda(cudaFree(gpu_b), "cudaFree gpu_b");
    check_cuda(cudaFree(gpu_c), "cudaFree gpu_c");

    // Free the CPU-function based memory allocations
    free(a);
    free(b);
    free(c);

    return mismatches == 0 ? 0 : EXIT_FAILURE;
}

Makefile:

# Build settings for the gpu-example CUDA program.
INC=-I/usr/local/cuda/include
NVCC=/usr/local/cuda/bin/nvcc
NVCC_OPT=-std=c++11

# all and clean name actions, not files.
.PHONY: all clean

# Compile the single source file into the gpu-example executable.
# Recipe lines must begin with a tab character.
all:
	$(NVCC) $(NVCC_OPT) $(INC) gpu-example.cu -o gpu-example

# Remove the built executable; the leading '-' tells make to ignore a failing rm.
clean:
	-rm -f gpu-example

3. 运行:

gpu-test$ ./gpu-example 
vector_add_cpu: 1370132 nanoseconds.
vector_add_gpu: 6445 nanoseconds.

最近编辑记录我思故我在 (2019-10-08 09:21:32)

我思故我在 · 2019-10-08 09:20:41

链接: C ++中的CUDA GPU编程教程

引言

在本指南中，我们将探讨使用C ++编程的GPU的强大功能。
开发人员可以期望使用C ++具有令人难以置信的性能，
并且使用低级语言访问GPU的强大功能可以生成一些当前可用的最快的计算。

要求条件

虽然任何能够运行现代版本Linux的计算机都可以支持C ++编译器，
但您将需要基于NVIDIA的显卡。如果没有GPU，
则可以使用Amazon Web Services或您喜欢的任何其他云服务提供商来激活实例。

如果决定使用物理机，请确保已安装NVIDIA的专有驱动程序。相关说明可以在以下位置找到：
https://linuxhint.com/install-nvidia-drivers-linux/

除了驱动程序，您还需要CUDA工具包。在此示例中，我们将使用Ubuntu 16.04 LTS，
但是大多数主要Linux发行版都有可供下载的版本，可以在以下链接中找到：
https://developer.nvidia.com/cuda-downloads

对于Ubuntu，您应该选择下载.deb文件。下载的文件没有扩展名.deb，但是您可以轻松地重命名它。
然后，您可以使用以下命令进行安装：

sudo dpkg -i package-name.deb
可能会询问您是否要安装GPG密钥，如果要安装，请按照提供的说明进行安装。

完成后，更新您的存储库：

sudo apt-get update
sudo apt-get install cuda -y
完成后，建议您重新启动系统以确保所有内容均已正确加载。

GPU开发的好处

CPU处理许多不同的输入和输出，并且包含各种功能，
不仅可以处理各种必需的程序，而且还可以管理各种硬件配置。
它们还要处理内存、缓存、系统总线、分段和I/O功能，因而成为样样都能做的“多面手”。

GPU则相反：它们包含许多独立的处理器，这些处理器专注于非常简单的数学函数。
因此，它们处理这类任务的速度比CPU快许多倍。通过专注于标量函数（接受一个或多个输入并返回单个输出的函数），
它们以极度专业化为代价换取了极高的性能。

样例代码

在此示例中，我们将把两个向量相加。代码同时提供一个CPU版本和一个GPU版本，以便比较速度。
gpu-example.cu 文件包含以下代码：

#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cstdio>
#include <chrono>
 
typedef std::chrono::high_resolution_clock Clock;
 
#define ITER 65535
 
// CPU version of the vector sum function.
// Writes a[k] + b[k] into c[k] for each k in [0, n), in ascending order of k.
// Leaves c untouched when n is zero or negative.
void vector_add_cpu(int *a, int *b, int *c, int n) {
    int pos = 0;
    while (pos < n) {
        c[pos] = a[pos] + b[pos];
        ++pos;
    }
}
 
// GPU version of the vector sum function.
//
// Launch layout: 1-D grid of 1-D blocks. Any grid/block size is valid: each
// thread starts at its global index and advances by the total number of
// launched threads, so every index in [0, n) is written exactly once and no
// thread reads or writes at or beyond index n.
// Uses no shared memory; runs on whichever stream the caller launches it on.
__global__ void vector_add_gpu(int *gpu_a, int *gpu_b, int *gpu_c, int n) {
    // 64-bit index so that i += stride cannot overflow for large n.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n; i += stride) {
        gpu_c[i] = gpu_a[i] + gpu_b[i];
    }
}

// Print a diagnostic and terminate the program if a CUDA runtime call failed.
// `what` names the operation so the message says which step went wrong.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two ITER-element vectors on the CPU and on the GPU, prints the
// wall-clock time of each, and checks the GPU result against the CPU result.
// Returns 0 on success, non-zero if an allocation fails or the results differ.
int main() {

    int *a, *b, *c;
    int *gpu_a, *gpu_b, *gpu_c;

    a = (int *)malloc(ITER * sizeof(int));
    b = (int *)malloc(ITER * sizeof(int));
    c = (int *)malloc(ITER * sizeof(int));
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // We need variables that are accessible from CUDA;
    // cudaMallocManaged provides them.
    check_cuda(cudaMallocManaged(&gpu_a, ITER * sizeof(int)), "cudaMallocManaged gpu_a");
    check_cuda(cudaMallocManaged(&gpu_b, ITER * sizeof(int)), "cudaMallocManaged gpu_b");
    check_cuda(cudaMallocManaged(&gpu_c, ITER * sizeof(int)), "cudaMallocManaged gpu_c");

    // Fill the host inputs and give the GPU inputs the same values, so both
    // versions operate on identical, initialised data.
    for (int i = 0; i < ITER; ++i) {
        a[i] = i;
        b[i] = i;
        c[i] = i;
        gpu_a[i] = i;
        gpu_b[i] = i;
        gpu_c[i] = 0;
    }

    // Call the CPU version and time it
    auto cpu_start = Clock::now();
    vector_add_cpu(a, b, c, ITER);
    auto cpu_end = Clock::now();
    std::cout << "vector_add_cpu: "
    << std::chrono::duration_cast<std::chrono::nanoseconds>(cpu_end - cpu_start).count()
    << " nanoseconds.\n";

    // Call the GPU version and time it
    // The triple angle brackets are a CUDA runtime extension that passes the
    // launch configuration (number of blocks, threads per block) to a kernel.
    // A single block cannot hold ITER threads (the CUDA programming guide
    // limits a block to 1024 threads), so the work is split into enough
    // 256-thread blocks to cover all ITER elements.
    const int threads_per_block = 256;
    const int blocks = (ITER + threads_per_block - 1) / threads_per_block;
    auto gpu_start = Clock::now();
    vector_add_gpu <<<blocks, threads_per_block>>> (gpu_a, gpu_b, gpu_c, ITER);
    // A kernel launch returns nothing; configuration errors are reported here.
    check_cuda(cudaGetLastError(), "vector_add_gpu launch");
    // Wait for the kernel to finish so the timing covers its execution and
    // so gpu_c can be read from the host afterwards.
    check_cuda(cudaDeviceSynchronize(), "vector_add_gpu execution");
    auto gpu_end = Clock::now();
    std::cout << "vector_add_gpu: "
    << std::chrono::duration_cast<std::chrono::nanoseconds>(gpu_end - gpu_start).count()
    << " nanoseconds.\n";

    // Compare the GPU output with the CPU reference; a timing is only
    // meaningful if the kernel actually produced the right values.
    int mismatches = 0;
    for (int i = 0; i < ITER; ++i) {
        if (gpu_c[i] != c[i]) {
            ++mismatches;
        }
    }
    if (mismatches != 0) {
        fprintf(stderr, "vector_add_gpu: %d of %d results differ from the CPU reference\n",
                mismatches, ITER);
    }

    // Release the memory allocated for the GPU function
    check_cuda(cudaFree(gpu_a), "cudaFree gpu_a");
    check_cuda(cudaFree(gpu_b), "cudaFree gpu_b");
    check_cuda(cudaFree(gpu_c), "cudaFree gpu_c");

    // Release the memory allocated for the CPU function
    free(a);
    free(b);
    free(c);

    return mismatches == 0 ? 0 : EXIT_FAILURE;
}

Makefile包含：

# Build settings for the gpu-example CUDA program.
INC=-I/usr/local/cuda/include
NVCC=/usr/local/cuda/bin/nvcc
NVCC_OPT=-std=c++11

# all and clean name actions, not files.
.PHONY: all clean

# Compile the single source file into the gpu-example executable.
# Recipe lines must begin with a tab character.
all:
	$(NVCC) $(NVCC_OPT) $(INC) gpu-example.cu -o gpu-example

# Remove the built executable; the leading '-' tells make to ignore a failing rm.
clean:
	-rm -f gpu-example

要运行该示例，请首先编译：
make

然后运行程序：
./gpu-example

如您所见，CPU版本的运行速度明显慢于GPU版本。

如果不是，则必须将gpu-example.cu 中定义的ITER设置为更高的数字。
这是因为GPU的配置时间比一些占用大量CPU的较小循环更长。
我发现65535在我的机器上可以正常工作，但您的实际情况可能会有所不同。
但是，一旦清除此阈值，GPU就会比CPU快得多。

结论

希望您从我们的C ++ GPU编程入门中学到很多。前面的示例并没有太大的成就，
但是演示的概念提供了一个框架，您可以使用该框架合并您的想法以释放GPU的功能。

最近编辑记录我思故我在 (2019-10-08 09:21:19)

我思故我在 · 2019-10-08 09:22:09

一楼的代码测试通过。

cuda 入门教程:
https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial01/
https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial02/

我思故我在 · 2019-10-08 22:00:31

一楼那个toolkit安装或许可以用这个命令代替： sudo apt install nvidia-cuda-toolkit

WhyCan Forum(哇酷开发者社区)

楼主 #1 2019-10-08 09:14:00 分享评论